From 5488fb789ea5692268309baa76f67598155060be Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 26 Jul 2023 15:56:53 +0200 Subject: [PATCH 01/70] ggml : allocate graphs in a context (#2392) * ggml : graph allocation in contexts * allocate work buffer as a ggml_object in ggml_graph_compute_with_ctx * llama.cpp : allocate graph in the context * add GGML_PAD --------- Co-authored-by: Georgi Gerganov --- ggml.c | 171 +++++++++++++++++++++++++++++++----------------------- ggml.h | 21 ++++++- llama.cpp | 30 +++++----- 3 files changed, 134 insertions(+), 88 deletions(-) diff --git a/ggml.c b/ggml.c index 35c56151b8f7c..33459f263657e 100644 --- a/ggml.c +++ b/ggml.c @@ -4071,8 +4071,8 @@ bool ggml_is_numa(void) { //////////////////////////////////////////////////////////////////////////////// void ggml_print_object(const struct ggml_object * obj) { - GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n", - obj->offs, obj->size, (const void *) obj->next); + GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n", + obj->type, obj->offs, obj->size, (const void *) obj->next); } void ggml_print_objects(const struct ggml_context * ctx) { @@ -4212,7 +4212,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { } size_t ggml_tensor_overhead(void) { - return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16; + return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE; } bool ggml_is_transposed(const struct ggml_tensor * tensor) { @@ -4383,7 +4383,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { return NULL; } - const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1); + const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN); *ctx = (struct ggml_context) { /*.mem_size =*/ mem_size, @@ -4472,12 +4472,14 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { struct ggml_object * obj = ctx->objects_begin; while (obj != NULL) { - struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs); + if (obj->type == GGML_OBJECT_TENSOR) { + struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs); - const size_t size = ggml_nbytes(tensor); + const size_t size = ggml_nbytes(tensor); - if (max_size < size) { - max_size = size; + if (max_size < size) { + max_size = size; + } } obj = obj->next; @@ -4509,12 +4511,7 @@ static void ggml_scratch_load(struct ggml_context * ctx) { //////////////////////////////////////////////////////////////////////////////// -static struct ggml_tensor * ggml_new_tensor_impl( - struct ggml_context * ctx, - enum ggml_type type, - int n_dims, - const int64_t* ne, - void* data) { +static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) { // always insert objects at the end of the context's memory pool struct ggml_object * obj_cur = ctx->objects_end; @@ -4522,77 +4519,79 @@ static struct ggml_tensor * ggml_new_tensor_impl( const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size; const size_t cur_end = cur_offs + cur_size; - size_t size_needed = 0; - - if (data == NULL && !ctx->no_alloc) { - size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]); - for (int i = 1; i < n_dims; i++) { - size_needed *= ne[i]; - } - // align to GGML_MEM_ALIGN - size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN; - } + // align to GGML_MEM_ALIGN + size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN); char * const mem_buffer = ctx->mem_buffer; struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); - if (ctx->scratch.data == NULL || data != NULL) { - size_needed += GGML_TENSOR_SIZE; + if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { + GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", + __func__, cur_end + size_needed, ctx->mem_size); + assert(false); + return NULL; + } - if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { - GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size); - assert(false); - return NULL; - } + *obj_new = (struct ggml_object) { + .offs = cur_end + GGML_OBJECT_SIZE, + .size = size_needed, + .next = NULL, + .type = type, + }; - *obj_new = (struct ggml_object) { - .offs = cur_end + GGML_OBJECT_SIZE, - .size = size_needed, - .next = NULL, - }; + ggml_assert_aligned(mem_buffer + obj_new->offs); + + if (obj_cur != NULL) { + obj_cur->next = obj_new; } else { - if (ctx->scratch.offs + size_needed > ctx->scratch.size) { - GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", - __func__, ctx->scratch.offs + size_needed, ctx->scratch.size); - assert(false); - return NULL; + // this is the first object in this context + ctx->objects_begin = obj_new; + } + + ctx->objects_end = obj_new; + + //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size); + + return obj_new; +} + +static struct ggml_tensor * ggml_new_tensor_impl( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int64_t* ne, + void* data) { + + size_t data_size = 0; + + if (data == NULL && !ctx->no_alloc) { + data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]); + for (int i = 1; i < n_dims; i++) { + data_size *= ne[i]; } + } - if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) { - GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size); + if (ctx->scratch.data != NULL && data == NULL) { + // allocate tensor data in the scratch buffer + if (ctx->scratch.offs + data_size > ctx->scratch.size) { + GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", + __func__, ctx->scratch.offs + data_size, ctx->scratch.size); assert(false); return NULL; } data = (char * const) ctx->scratch.data + ctx->scratch.offs; - *obj_new = (struct ggml_object) { - .offs = cur_end + GGML_OBJECT_SIZE, - .size = GGML_TENSOR_SIZE, - .next = NULL, - }; - - //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed); + ctx->scratch.offs += data_size; - ctx->scratch.offs += size_needed; + data_size = 0; } - if (obj_cur != NULL) { - obj_cur->next = obj_new; - } else { - // this is the first object in this context - ctx->objects_begin = obj_new; - } - - ctx->objects_end = obj_new; - - //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size); + struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size); - struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs); + // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here - ggml_assert_aligned(result); + struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs); *result = (struct ggml_tensor) { /*.type =*/ type, @@ -5026,9 +5025,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam char * const mem_buffer = ctx->mem_buffer; while (obj != NULL) { - struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs); - if (strcmp(cur->name, name) == 0) { - return cur; + if (obj->type == GGML_OBJECT_TENSOR) { + struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs); + if (strcmp(cur->name, name) == 0) { + return cur; + } } obj = obj->next; @@ -15829,6 +15830,35 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg return result; } +struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) { + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE); + struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs); + + *cgraph = (struct ggml_cgraph) { + /*.n_nodes =*/ 0, + /*.n_leafs =*/ 0, + /*.nodes =*/ { NULL }, + /*.grads =*/ { NULL }, + /*.leafs =*/ { NULL }, + /*.hash_table =*/ { NULL }, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + }; + + return cgraph; +} + +struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) { + struct ggml_cgraph * cgraph = ggml_new_graph(ctx); + ggml_build_forward_impl(cgraph, tensor, false); + return cgraph; +} + +size_t ggml_graph_overhead(void) { + return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN); +} + // // thread data // @@ -16544,10 +16574,9 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); - struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size); - GGML_ASSERT(buf); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); - cplan.work_data = buf->data; + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; ggml_graph_compute(cgraph, &cplan); } diff --git a/ggml.h b/ggml.h index c309f1361c6f6..9919cce7c263f 100644 --- a/ggml.h +++ b/ggml.h @@ -208,6 +208,7 @@ #define GGML_UNUSED(x) (void)(x) +#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) #define GGML_ASSERT(x) \ do { \ @@ -396,6 +397,12 @@ extern "C" { GGML_UNARY_OP_SILU, }; + enum ggml_object_type { + GGML_OBJECT_TENSOR, + GGML_OBJECT_GRAPH, + GGML_OBJECT_WORK_BUFFER + }; + // ggml object struct ggml_object { size_t offs; @@ -403,7 +410,9 @@ extern "C" { struct ggml_object * next; - char padding[8]; + enum ggml_object_type type; + + char padding[4]; }; static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); @@ -424,7 +433,7 @@ extern "C" { enum ggml_op op; // op params - allocated as int32_t for alignment - int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)]; + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; bool is_param; @@ -485,6 +494,8 @@ extern "C" { int64_t perf_time_us; }; + static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph); + // scratch buffer struct ggml_scratch { size_t offs; @@ -1391,11 +1402,17 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); + // graph allocation in a context + GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); + GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API size_t ggml_graph_overhead(void); + // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); diff --git a/llama.cpp b/llama.cpp index 30d4b0a6e6c7c..024af99a563ac 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1424,7 +1424,7 @@ static bool llama_eval_internal( struct ggml_context * ctx0 = ggml_init(params); - ggml_cgraph gf = {}; + ggml_cgraph * gf = ggml_new_graph(ctx0); // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance @@ -1541,8 +1541,8 @@ static bool llama_eval_internal( ggml_set_name(v, "v"); // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = @@ -1712,21 +1712,21 @@ static bool llama_eval_internal( //cur = ggml_soft_max_inplace(ctx0, cur); // run the computation - ggml_build_forward_expand(&gf, cur); + ggml_build_forward_expand(gf, cur); // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs); #if GGML_USE_MPI - ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer); + ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); #endif #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { if (!ggml_metal_if_optimized(lctx.ctx_metal)) { - ggml_metal_graph_find_concurrency(lctx.ctx_metal,&gf); + ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf); } ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); - ggml_metal_graph_compute(lctx.ctx_metal, &gf); + ggml_metal_graph_compute(lctx.ctx_metal, gf); ggml_metal_get_tensor (lctx.ctx_metal, cur); } else { // IMPORTANT: @@ -1745,34 +1745,34 @@ static bool llama_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); } - ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); + ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); } #else - ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); + ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); #endif #if GGML_USE_MPI - ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer); + ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); #endif // update kv token count lctx.kv_self.n = n_past + N; - struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1]; + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; if (cgraph_fname) { - ggml_graph_export(&gf, cgraph_fname); + ggml_graph_export(gf, cgraph_fname); } #ifdef GGML_PERF // print timing information per ggml operation (for debugging purposes) // requires GGML_PERF to be defined - ggml_graph_print(&gf); + ggml_graph_print(gf); #endif // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { - // ggml_graph_dump_dot(&gf, NULL, "llama.dot"); + // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} // extract logits @@ -3177,7 +3177,7 @@ struct llama_context * llama_new_context_with_model( ctx->embedding.resize(hparams.n_embd); } - ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); + ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead()); ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); From 6df1f5940f889adde32fe47dc8881f010dcf9aba Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Wed, 26 Jul 2023 14:00:04 -0400 Subject: [PATCH 02/70] make : build with -Wmissing-prototypes (#2394) --- CMakeLists.txt | 1 + Makefile | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index abc96814d632d..c43e65e746320 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -357,6 +357,7 @@ if (LLAMA_ALL_WARNINGS) -Wshadow -Wstrict-prototypes -Wpointer-arith + -Wmissing-prototypes ) set(cxx_flags -Wall diff --git a/Makefile b/Makefile index fb7c27cd972bb..2035c525338a7 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,8 @@ ifdef LLAMA_SERVER_VERBOSE endif # warnings -CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith +CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \ + -Wmissing-prototypes CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar # OS specific From b5472ea0ada081a6e1c06998ebbc9a24aa2cd4a4 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 26 Jul 2023 23:57:23 +0200 Subject: [PATCH 03/70] ggml : fix assert in ggml_set_unary_op (#2410) --- ggml.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index 33459f263657e..b77f9926754ed 100644 --- a/ggml.c +++ b/ggml.c @@ -4982,11 +4982,6 @@ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0); } -static void ggml_set_unary_op(struct ggml_tensor * tensor, enum ggml_unary_op op) { - GGML_ASSERT(tensor->op = GGML_OP_UNARY); - ggml_set_op_params_i32(tensor, 0, (int32_t) op); -} - const char * ggml_get_name(const struct ggml_tensor * tensor) { return tensor->name; } @@ -7226,7 +7221,7 @@ static struct ggml_tensor * ggml_unary_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_set_unary_op(result, op); + ggml_set_op_params_i32(result, 0, (int32_t) op); result->op = GGML_OP_UNARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; From 1a941869cbef8e9cc351a6c6987e4ae3b0f021f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 27 Jul 2023 11:00:54 +0300 Subject: [PATCH 04/70] metal : disable graph concurrency optimization due to bug (#2413) --- llama.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 024af99a563ac..9a8ecdcf6f7f1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1722,9 +1722,10 @@ static bool llama_eval_internal( #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { - if (!ggml_metal_if_optimized(lctx.ctx_metal)) { - ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf); - } + // TODO: disabled until #2413 is resolved + //if (!ggml_metal_if_optimized(lctx.ctx_metal)) { + // ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf); + //} ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_graph_compute(lctx.ctx_metal, gf); ggml_metal_get_tensor (lctx.ctx_metal, cur); From 7c529cede6e84054e77a3eceab31c53de7b2f55b Mon Sep 17 00:00:00 2001 From: mj-shifu <77107165+mj-shifu@users.noreply.github.com> Date: Thu, 27 Jul 2023 22:39:17 +0200 Subject: [PATCH 05/70] convert.py : Update to support 70B HF format model files (#2427) * convert.py : fix llama 2 70b conversion from Huggingface --- convert.py | 96 +++++++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 44 deletions(-) mode change 100755 => 100644 convert.py diff --git a/convert.py b/convert.py old mode 100755 new mode 100644 index ac99579c49eb2..ab6a4e10e49c3 --- a/convert.py +++ b/convert.py @@ -133,7 +133,7 @@ def make_tensors_list() -> List[str]: def find_n_mult(n_ff: int, n_embd: int) -> int: # hardcoded magic range - for n_mult in range(256, 1, -1): + for n_mult in range(8192, 1, -1): calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult if calc_ff == n_ff: return n_mult @@ -141,11 +141,12 @@ def find_n_mult(n_ff: int, n_embd: int) -> int: @dataclass class Params: - n_vocab: int - n_embd: int - n_mult: int - n_head: int - n_layer: int + n_vocab: int + n_embd: int + n_mult: int + n_head: int + n_layer: int + n_kv_head: Optional[int] # This parameter is only used for Llama 2 @staticmethod def guessed(model: 'LazyModel') -> 'Params': @@ -167,11 +168,12 @@ def guessed(model: 'LazyModel') -> 'Params': n_head=n_embd // 128 # guessed return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_mult = 256, - n_head = n_head, - n_layer = n_layer, + n_vocab = n_vocab, + n_embd = n_embd, + n_mult = 256, + n_head = n_head, + n_layer = n_layer, + n_kv_head = None, ) @staticmethod @@ -183,15 +185,17 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params': n_head = config["num_attention_heads"]; n_layer = config["num_hidden_layers"]; n_ff = config["intermediate_size"]; + n_kv_head = config.get("num_key_value_heads") n_mult = find_n_mult(n_ff, n_embd); return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_mult = n_mult, - n_head = n_head, - n_layer = n_layer, + n_vocab = n_vocab, + n_embd = n_embd, + n_mult = n_mult, + n_head = n_head, + n_layer = n_layer, + n_kv_head = n_kv_head, ) # LLaMA v2 70B params.json @@ -200,21 +204,22 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params': def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params': config = json.load(open(config_path)) - n_vocab = config["vocab_size"]; - n_embd = config["dim"]; - n_head = config["n_heads"]; - n_layer = config["n_layers"]; - n_mult = config["multiple_of"]; + n_vocab = config["vocab_size"]; + n_embd = config["dim"]; + n_head = config["n_heads"]; + n_layer = config["n_layers"]; + n_mult = config["multiple_of"]; if n_vocab == -1: n_vocab = model["tok_embeddings.weight"].shape[0] return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_mult = n_mult, - n_head = n_head, - n_layer = n_layer, + n_vocab = n_vocab, + n_embd = n_embd, + n_mult = n_mult, + n_head = n_head, + n_layer = n_layer, + n_kv_head = None, ) @staticmethod @@ -317,10 +322,12 @@ def __repr__(self) -> str: Vocab = Union[SentencePieceVocab, GGMLVocab] -def permute(weights: NDArray, n_head: int) -> NDArray: +def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) + .swapaxes(1, 2) + .reshape(weights.shape)) def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray: @@ -368,7 +375,7 @@ class Tensor(metaclass=ABCMeta): @abstractmethod def astype(self, data_type: DataType) -> 'Tensor': ... @abstractmethod - def permute(self, n_head: int) -> 'Tensor': ... + def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ... @abstractmethod def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ... @abstractmethod @@ -406,8 +413,8 @@ def part(self, n_part: int) -> 'UnquantizedTensor': r = self.ndarray.shape[0] // 3 return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...]) - def permute(self, n_head: int) -> 'UnquantizedTensor': - return UnquantizedTensor(permute(self.ndarray, n_head)) + def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor': + return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head)) def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray: @@ -455,26 +462,27 @@ def astype(self, data_type: DataType) -> Tensor: def to_ggml(self) -> 'GGMLQuantizedTensor': return self - def permute(self, n_head: int) -> 'GGMLQuantizedTensor': - return GGMLQuantizedTensor(permute(self.ndarray, n_head), self.shape, self.data_type) + def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor': + return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type) GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor] class DeferredPermutedTensor(Tensor): - def __init__(self, base: Tensor, n_head: int) -> None: + def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None: self.base = base self.n_head = n_head + self.n_kv_head = n_kv_head self.data_type = self.base.data_type def astype(self, data_type: DataType) -> Tensor: - return self.base.astype(data_type).permute(self.n_head) + return self.base.astype(data_type).permute(self.n_head, self.n_kv_head) def to_ggml(self) -> GGMLCompatibleTensor: - return self.base.to_ggml().permute(self.n_head) + return self.base.to_ggml().permute(self.n_head, self.n_kv_head) - def permute(self, n_head: int) -> Tensor: + def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor: raise Exception("shouldn't permute twice") @@ -566,8 +574,8 @@ def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor': ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False) return ret - def permute(self, n_head: int) -> Tensor: - return DeferredPermutedTensor(self, n_head) + def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor: + return DeferredPermutedTensor(self, n_head, n_kv_head) def to_ggml(self) -> GGMLQuantizedTensor: # The output format looks like this: @@ -698,10 +706,10 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus: return ModelPlus(model, paths, format, vocab) -def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor: +def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor: def load() -> Tensor: - return lazy_tensor.load().permute(n_head) - return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) + return lazy_tensor.load().permute(n_head, n_kv_head) + return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description) def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor: def load() -> Tensor: @@ -726,7 +734,7 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel: for i in itertools.count(): if f"model.layers.{i}.self_attn.q_proj.weight" in model: out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head) - out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head) + out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head) out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] elif f"model.layers.{i}.self_attn.W_pack.weight" in model: out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head) From edcc7ae7d26007bbf83136e9d33f863fcad9b871 Mon Sep 17 00:00:00 2001 From: niansa/tuxifan Date: Fri, 28 Jul 2023 03:14:11 +0200 Subject: [PATCH 06/70] Obtaining LLaMA 2 instructions (#2308) * Obtaining LLaMA 2 instructions * Removed sharing warning for LLaMA 2 * Linked TheBloke's GGML repos * Add LLaMA 2 to list of supported models * Added LLaMA 2 usage instructions * Added links to LLaMA 2 70B models --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index a0e0ea2e01262..6a3268d129b55 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,7 @@ as the main playground for developing new features for the [ggml](https://github **Supported models:** - [X] LLaMA 🦙 +- [x] LLaMA 2 🦙🦙 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca) - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) @@ -650,6 +651,19 @@ python3 convert.py pygmalion-7b/ --outtype q4_1 - The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data. +### Obtaining and using the Facebook LLaMA 2 model + +- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data. +- Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including: + - [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGML) + - [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGML) + - [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGML) + - [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML) + - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML) + - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML) +- Specify `-eps 1e-5` for best generation quality +- Specify `-gqa 8` for 70B models to work + ### Verifying the model files Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files. From 65cdf34bdc469fa86248e667a5880992684ef114 Mon Sep 17 00:00:00 2001 From: Rand Xie Date: Fri, 28 Jul 2023 01:42:53 -0700 Subject: [PATCH 07/70] llama : use n_embd_gqa instead of n_embd to handle llama-2 70B (#2433) --- examples/save-load-state/save-load-state.cpp | 1 + llama.cpp | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 4c868850317fe..61c71c3589fdf 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -26,6 +26,7 @@ int main(int argc, char ** argv) { auto lparams = llama_context_default_params(); lparams.n_ctx = params.n_ctx; + lparams.n_gqa = params.n_gqa; lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.use_mmap = params.use_mmap; diff --git a/llama.cpp b/llama.cpp index 9a8ecdcf6f7f1..a4489773f96c5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3663,7 +3663,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; const int n_layer = hparams.n_layer; - const int n_embd = hparams.n_embd; + const int n_embd = hparams.n_embd_gqa(); const int n_ctx = hparams.n_ctx; const size_t kv_size = kv_self.buf.size; @@ -3766,7 +3766,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; const int n_layer = hparams.n_layer; - const int n_embd = hparams.n_embd; + const int n_embd = hparams.n_embd_gqa(); const int n_ctx = hparams.n_ctx; size_t kv_size; From d91f3f0c55663719ea03b76311e8c36ed55eb0e2 Mon Sep 17 00:00:00 2001 From: Weird Constructor Date: Fri, 28 Jul 2023 10:44:43 +0200 Subject: [PATCH 08/70] readme : fix the description of the Tail free sampling (TFS) method (#2431) --- examples/main/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/main/README.md b/examples/main/README.md index 37538613042b0..014112e5db484 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -202,9 +202,9 @@ Example usage: `--top-p 0.95` - `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled). -Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. The method adjusts the logits (token probabilities) by raising them to the power of the parameter z. A higher value of z (e.g., 2.0) will further suppress less likely tokens from the tail of the distribution, while a value of 1.0 disables the effect of TFS. By setting the parameter z, you can control how much the probabilities of less likely tokens are reduced. +Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens, and thus disables the effect of TFS. -Example usage: `--tfs 2.0` +Example usage: `--tfs 0.95` ### Locally Typical Sampling From 34ae1caf7fdea4c4c09087002e15e6230102e6a9 Mon Sep 17 00:00:00 2001 From: nhamanasu <45545786+nhamanasu@users.noreply.github.com> Date: Sat, 29 Jul 2023 03:02:10 +0900 Subject: [PATCH 09/70] examples : server chat mode with llama2 (#2400) * add: server chat mode with llama2 * fix: remove the unnecessary last \n --- examples/server-llama2-13B.sh | 26 ++++++++ examples/server/chat-llama2.sh | 109 +++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 examples/server-llama2-13B.sh create mode 100644 examples/server/chat-llama2.sh diff --git a/examples/server-llama2-13B.sh b/examples/server-llama2-13B.sh new file mode 100644 index 0000000000000..d7f3698596c41 --- /dev/null +++ b/examples/server-llama2-13B.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -e + +cd "$(dirname "$0")/.." || exit + +# Specify the model you want to use here: +MODEL="${MODEL:-./models/llama-2-13b-chat.ggmlv3.q5_K_M.bin}" +PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat-system.txt} + +# Adjust to the number of CPU cores you want to use. +N_THREAD="${N_THREAD:-12}" + +# Note: you can also override the generation options by specifying them on the command line: +GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}" + + +# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS +./server $GEN_OPTIONS \ + --model "$MODEL" \ + --threads "$N_THREAD" \ + --rope-freq-scale 1.0 \ + "$@" + + # I used this to test the model with mps, but omitted it from the general purpose. If you want to use it, just specify it on the command line. + # -ngl 1 \ diff --git a/examples/server/chat-llama2.sh b/examples/server/chat-llama2.sh new file mode 100644 index 0000000000000..1fc79b7e19137 --- /dev/null +++ b/examples/server/chat-llama2.sh @@ -0,0 +1,109 @@ +#!/bin/bash + +API_URL="${API_URL:-http://127.0.0.1:8080}" + +CHAT=( + "Hello, Assistant." + "Hello. How may I help you today?" +) + +INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + +trim() { + shopt -s extglob + set -- "${1##+([[:space:]])}" + printf "%s" "${1%%+([[:space:]])}" +} + +trim_trailing() { + shopt -s extglob + printf "%s" "${1%%+([[:space:]])}" +} + +format_prompt() { + if [[ "${#CHAT[@]}" -eq 0 ]]; then + echo -n "[INST] <>\n${INSTRUCTION}\n<>" + else + LAST_INDEX=$(( ${#CHAT[@]} - 1 )) + echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]" + fi +} + +tokenize() { + curl \ + --silent \ + --request POST \ + --url "${API_URL}/tokenize" \ + --header "Content-Type: application/json" \ + --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \ + | jq '.tokens[]' +} + +N_KEEP=$(tokenize "[INST] <>\n${INSTRUCTION}\n<>" | wc -l) + +chat_completion() { + PROMPT="$(trim_trailing "$(format_prompt "$1")")" + DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{ + prompt: ., + temperature: 0.2, + top_k: 40, + top_p: 0.9, + n_keep: $n_keep, + n_predict: 1024, + stop: ["[INST]"], + stream: true + }')" + + # Create a temporary file to hold the Python output + TEMPFILE=$(mktemp) + + exec 3< <(curl \ + --silent \ + --no-buffer \ + --request POST \ + --url "${API_URL}/completion" \ + --header "Content-Type: application/json" \ + --data-raw "${DATA}") + + python -c " +import json +import sys + +answer = '' +while True: + line = sys.stdin.readline() + if not line: + break + if line.startswith('data: '): + json_content = line[6:].strip() + content = json.loads(json_content)['content'] + sys.stdout.write(content) + sys.stdout.flush() + answer += content + +answer = answer.rstrip('\n') + +# Write the answer to the temporary file +with open('$TEMPFILE', 'w') as f: + f.write(answer) + " <&3 + + exec 3<&- + + # Read the answer from the temporary file + ANSWER=$(cat $TEMPFILE) + + # Clean up the temporary file + rm $TEMPFILE + + printf "\n" + + CHAT+=("$1" "$(trim "$ANSWER")") +} + +while true; do + echo -en "\033[0;32m" # Green color + read -r -e -p "> " QUESTION + echo -en "\033[0m" # Reset color + chat_completion "${QUESTION}" +done From d73b8d48b45d6e2c0ae9bb8c39623c4024adc275 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Jul 2023 21:05:08 +0300 Subject: [PATCH 10/70] examples : fix whitespace --- examples/server-llama2-13B.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/server-llama2-13B.sh b/examples/server-llama2-13B.sh index d7f3698596c41..17fedc2b176f6 100644 --- a/examples/server-llama2-13B.sh +++ b/examples/server-llama2-13B.sh @@ -21,6 +21,6 @@ GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}" --threads "$N_THREAD" \ --rope-freq-scale 1.0 \ "$@" - - # I used this to test the model with mps, but omitted it from the general purpose. If you want to use it, just specify it on the command line. - # -ngl 1 \ + +# I used this to test the model with mps, but omitted it from the general purpose. If you want to use it, just specify it on the command line. +# -ngl 1 \ From ee1b497c985f61d6ec519c39fcfed78a3c6f1d06 Mon Sep 17 00:00:00 2001 From: eric8607242 Date: Sat, 29 Jul 2023 02:10:05 +0800 Subject: [PATCH 11/70] llama : support more diverse tokenizers? (#2420) * supporting more diverse tokenizers * Update llama.cpp --------- Co-authored-by: Georgi Gerganov --- llama.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index a4489773f96c5..a35c690ea9f68 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1924,7 +1924,9 @@ struct llama_tokenizer { if (token == vocab_.token_to_id.end()) { // output any symbols that did not form tokens as bytes. for (int j = 0; j < (int) symbol.n; ++j) { - llama_vocab::id token_id = static_cast(symbol.text[j]) + 3; + // NOTE: old version, before #2420 - not sure what are the implications of this + //llama_vocab::id token_id = static_cast(symbol.text[j]) + 3; + llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j])); output.push_back(token_id); } } else { From a9559bf77b903d94eb21614ceae5ae8950f0f1fc Mon Sep 17 00:00:00 2001 From: Lee <44310445+lx200916@users.noreply.github.com> Date: Sat, 29 Jul 2023 02:17:45 +0800 Subject: [PATCH 12/70] ggml : workaround for missing _mm256_setr_m128i in GCC < 8 in k_quants.c (#2405) --- k_quants.c | 62 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/k_quants.c b/k_quants.c index e792c57acc7b7..6348fce6b94d0 100644 --- a/k_quants.c +++ b/k_quants.c @@ -39,6 +39,8 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) + // // 2-6 bit quantization in super-blocks // @@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const __m256i all_scales = _mm256_cvtepi8_epi16(scales8); const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); - const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)}; + const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; __m256i sumi = _mm256_setzero_si256(); @@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8])); // sumf += -dmin * summs in 32bits*8 - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc); const __m128i scales_0 = _mm_cvtepi8_epi16(scales16); const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16)); @@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri } // sumf += dall * isum - dmin * summs in 32bits - __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0); + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc); } @@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri summs += dmin * smin; const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); - const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3); - const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3); + const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3); + const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3); const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); @@ -1709,10 +1711,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0)); const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1)); - const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0)); - const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1)); - const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2)); - const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3)); + const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0)); + const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1)); + const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2)); + const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3)); acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc); acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc); @@ -1917,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const __m256i all_scales = _mm256_cvtepi8_epi16(scales128); const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); - const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)}; + const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; // high bit const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask); @@ -2128,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri } // multiply with block scale and accumulate - __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0); + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); } @@ -2303,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri aux16[0] = a & 0x0f0f; aux16[1] = (a >> 4) & 0x0f0f; - const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8)); - const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8)); + const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8)); + const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8)); memcpy(&aux64, x[i].hmask, 8); const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0); - __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux); + __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux); __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4); q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2); q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2); @@ -2318,7 +2320,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3); // prepare low and high bits - const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits); + const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits); const __m256i q3l_0 = _mm256_and_si256(q3aux, m3); const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3); @@ -2429,7 +2431,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri p16_0 = _mm_add_epi32(p16_0, p16_2); p16_1 = _mm_add_epi32(p16_1, p16_3); - __m256i p16 = _mm256_set_m128i(p16_1, p16_0); + __m256i p16 = MM256_SET_M128I(p16_1, p16_0); // multiply with block scale and accumulate acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc); @@ -2620,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m); const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); - const __m256i scales = _mm256_set_m128i(sc128, sc128); + const __m256i scales = MM256_SET_M128I(sc128, sc128); __m256i sumi = _mm256_setzero_si256(); @@ -2727,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri } __m256 vd = _mm256_set1_ps(d); - __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0); + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); } @@ -2968,11 +2970,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0); const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1); - acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc); + acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc); const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2); const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3); - acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc); + acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc); } @@ -3160,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri summs += dmin * _mm_extract_epi32(hsum, 0); const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); - const __m256i scales = _mm256_set_m128i(sc128, sc128); + const __m256i scales = MM256_SET_M128I(sc128, sc128); const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh); __m256i hmask = mone; @@ -3299,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri } __m256 vd = _mm256_set1_ps(d); - __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0); + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); } @@ -3462,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); - const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0])); - const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2])); + const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0])); + const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2])); int64_t aux64; memcpy(&aux64, x[i].qh, 8); const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64); - const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128); + const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128); const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4); const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4); @@ -3543,7 +3545,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2)); const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3)); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc); } @@ -3925,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri } - __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0); + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); } @@ -4083,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh); - const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4); - const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4); + const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4); + const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4); const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0); const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1); @@ -4177,7 +4179,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc); } *s = hsum_float_8(acc); From 8a88e5855c3b93024be0f93290b01a4206b65b38 Mon Sep 17 00:00:00 2001 From: klosax <131523366+klosax@users.noreply.github.com> Date: Fri, 28 Jul 2023 20:25:36 +0200 Subject: [PATCH 13/70] perplexity : add Hellaswag calculation (#2389) * common.h : add hellaswag / remove perplexity-lines * common.cpp : add hellaswag / remove perplexity-lines * perplexity.cpp : add hellswag scores / remove perplexity-lines * perplexity.cpp : clean up * common.h : change default param value * common.cpp : Change default param * perplexity.cpp : alter wording * common.h : alter wording * common.cpp : alter wording --- examples/common.cpp | 15 ++- examples/common.h | 6 +- examples/perplexity/perplexity.cpp | 179 ++++++++++++++++++++++------- 3 files changed, 155 insertions(+), 45 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index dd964c8a7481a..fe7308b1787eb 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -402,8 +402,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.antiprompt.push_back(argv[i]); } else if (arg == "--perplexity") { params.perplexity = true; - } else if (arg == "--perplexity-lines") { - params.perplexity_lines = true; + } else if (arg == "--hellaswag") { + params.hellaswag = true; + } else if (arg == "--hellaswag-tasks") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.hellaswag_tasks = std::stoi(argv[i]); } else if (arg == "--ignore-eos") { params.logit_bias[llama_token_eos()] = -INFINITY; } else if (arg == "--no-penalize-nl") { @@ -559,8 +565,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp); fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n"); - fprintf(stdout, " --perplexity-lines compute perplexity over each line of the prompt\n"); - fprintf(stdout, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); + fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); + fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %d)\n", params.hellaswag_tasks); + fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); if (llama_mlock_supported()) { fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); diff --git a/examples/common.h b/examples/common.h index 672dcf77c8f99..1184f32df5074 100644 --- a/examples/common.h +++ b/examples/common.h @@ -70,7 +70,10 @@ struct gpt_params { std::string lora_adapter = ""; // lora adapter path std::string lora_base = ""; // base model path for the lora adapter - bool low_vram = false; // if true, reduce VRAM usage at the cost of performance + bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt + size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score + + bool low_vram = false; // if true, reduce VRAM usage at the cost of performance bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs @@ -86,7 +89,6 @@ struct gpt_params { bool instruct = false; // instruction mode (used for Alpaca models) bool penalize_nl = true; // consider newlines as a repeatable token bool perplexity = false; // compute perplexity over the prompt - bool perplexity_lines = false; // compute perplexity over each line of the prompt bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool mem_test = false; // compute maximum memory usage diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index d23b7e7f0c1b8..6870a11b931db 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -121,8 +121,23 @@ void perplexity(llama_context * ctx, const gpt_params & params) { printf("\n"); } -void perplexity_lines(llama_context * ctx, const gpt_params & params) { - // Calculates perplexity over each line of the prompt +void hellaswag_score(llama_context * ctx, const gpt_params & params) { + // Calculates hellaswag score (acc_norm) from prompt + // + // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl + // All used data fields are preprocessed as in https://github.com/EleutherAI/lm-evaluation-harness/blob/df3da98c5405deafd519c2ddca52bb7c3fe36bef/lm_eval/tasks/hellaswag.py#L62-L68 + // + // All 10042 tasks should be extracted to keep the results standardized like other implementations. + // + // Datafile layout: + // ['??'] denotes json fields + // 6 lines per task: + // ['activity_label'] + ": " +['ctx'] - The first part of the query, the context + // ['label'] - The index the best common sense ending aka gold ending + // ['endings'][0] - Endings added to the first part of the query + // ['endings'][1] + // ['endings'][2] + // ['endings'][3] std::vector prompt_lines; std::istringstream strstream(params.prompt); @@ -132,63 +147,149 @@ void perplexity_lines(llama_context * ctx, const gpt_params & params) { prompt_lines.push_back(line); } - const int n_vocab = llama_n_vocab(ctx); + if( prompt_lines.size() % 6 != 0) { + fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__); + return; + } - int counttotal = 0; - size_t n_lines = prompt_lines.size(); + size_t hs_task_count = prompt_lines.size()/6; + fprintf(stderr, "%s : loaded %lu tasks from prompt.\n", __func__, hs_task_count); - double nll = 0.0; + // This is needed as usual for LLaMA models + bool prepend_bos = true; + + // Number of tasks to use when computing the score + if ( params.hellaswag_tasks < hs_task_count ) { + hs_task_count = params.hellaswag_tasks; + } - fprintf(stderr, "%s: calculating perplexity over %lu lines\n", __func__, n_lines); + // The tasks should be randomized so the score stabilizes quickly. + bool randomize_tasks = true; - printf("\nLine\tPPL line\tPPL cumulative\n"); + // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now + std::mt19937 rng(1); - for (size_t i = 0; i < n_lines; ++i) { + // Dataholder for hellaswag tasks + struct hs_data_t { + std::string context; + size_t gold_ending_idx; + std::string ending[4]; + size_t ending_logprob_count[4]; + double ending_logprob[4]; + }; - // Tokenize and insert BOS at start - std::vector batch_embd = ::llama_tokenize(ctx, prompt_lines[i], true); + fprintf(stderr, "%s : selecting %lu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") ); - size_t batch_size = batch_embd.size(); + // Select and read data from prompt lines + hs_data_t *hs_data = new hs_data_t[hs_task_count]; + for (size_t i=0; i < hs_task_count; i++) { + size_t idx = i; - // Stop if line is too long - if( batch_size > (size_t)params.n_ctx ) { - fprintf(stderr, "%s : tokens in line %lu > n_ctxl\n", __func__, i); - return; + // Select a random example of those left in the prompt + if (randomize_tasks) { + std::uniform_int_distribution dist(0, prompt_lines.size()/6-1 ) ; + idx = dist(rng); } - if (llama_eval(ctx, batch_embd.data(), batch_size, 0, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return; + hs_data[i].context = prompt_lines[idx*6]; + hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] ); + for (size_t j=0; j < 4; j++) { + hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j]; } - const auto batch_logits = llama_get_logits(ctx); - std::vector logits; - logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + // Delete the selected random example from the prompt + if (randomize_tasks) { + prompt_lines.erase( std::next(prompt_lines.begin(),idx*6) , std::next(prompt_lines.begin(),idx*6+6) ); + } + } - double nllline = 0.0; - int countline = 0; + fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__); + printf("\ntask\tacc_norm\n"); - // Perplexity over second half of the line - for (size_t j = batch_size/2; j < batch_size - 1; ++j) { - // Calculate probability of next token, given the previous ones. - const std::vector tok_logits( - logits.begin() + (j + 0) * n_vocab, - logits.begin() + (j + 1) * n_vocab); + double acc = 0.0f; + const int n_vocab = llama_n_vocab(ctx); + + for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) { + + // Tokenize the context to count tokens + std::vector context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos); + size_t context_size = context_embd.size(); + + for (size_t ending_idx=0;ending_idx<4;ending_idx++) { + + // Tokenize the query + std::vector query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos); + size_t query_size = query_embd.size(); + + // Stop if query wont fit the ctx window + if (query_size > (size_t)params.n_ctx) { + fprintf(stderr, "%s : number of tokens in query %lu > n_ctxl\n", __func__, query_size); + return; + } - const float prob = softmax(tok_logits)[batch_embd[ j + 1]]; + // Speedup small evaluations by evaluating atleast 32 tokens + if (query_size < 32) { + query_embd.resize(32); + } + + // Evaluate the query + if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return; + } + + const auto query_logits = llama_get_logits(ctx); + std::vector logits; + logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab); + + hs_data[task_idx].ending_logprob_count[ending_idx] = 0; + hs_data[task_idx].ending_logprob[ending_idx] = 0.0f; + + // Calculate the logprobs over the ending + for (size_t j = context_size-1; j < query_size - 1; j++) { + // Calculate probability of next token, given the previous ones. + const std::vector tok_logits( + logits.begin() + (j + 0) * n_vocab, + logits.begin() + (j + 1) * n_vocab); + + const float prob = softmax(tok_logits)[query_embd[ j + 1]]; + + hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob); + hs_data[task_idx].ending_logprob_count[ending_idx]++; + } + + // Calculate the mean token logprob for acc_norm + hs_data[task_idx].ending_logprob[ending_idx] /= hs_data[task_idx].ending_logprob_count[ending_idx]; + + +// printf("task %lu, ending %lu, whole_len %lu, context_len %lu, ending_logprob_count %lu, ending_logprob %.4f\n", +// task_idx,ending_idx,whole_size,context_size, hs_data[task_idx].ending_logprob_count[ending_idx], hs_data[task_idx].ending_logprob[ending_idx] ); + } - nllline += -std::log(prob); - ++countline; + // Find the ending with maximum logprob + size_t ending_logprob_max_idx = -1; + double ending_logprob_max_val = -INFINITY; + for (size_t j=0; j < 4; j++) { + if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) { + ending_logprob_max_idx = j; + ending_logprob_max_val = hs_data[task_idx].ending_logprob[j]; + } } - nll += nllline; - counttotal += countline; +// printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_data[task_idx].gold_ending_idx); - // perplexity is e^(average negative log-likelihood) - printf("%lu\t%.8lf\t%.8lf\n", i + 1, std::exp(nllline/countline), std::exp(nll / counttotal) ); + // If the gold ending got the maximum logprobe add one accuracy point + if (ending_logprob_max_idx == hs_data[task_idx].gold_ending_idx) { + acc += 1.0; + } + + // Print the accumulated accuracy mean x 100 + printf("%li\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0); fflush(stdout); } + delete [] hs_data; + printf("\n"); } @@ -240,8 +341,8 @@ int main(int argc, char ** argv) { params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } - if (params.perplexity_lines) { - perplexity_lines(ctx, params); + if (params.hellaswag) { + hellaswag_score(ctx, params); } else { perplexity(ctx, params); } From e4b42e5b1543aa2b906debb70348a586a5f27c71 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 29 Jul 2023 11:15:57 +0800 Subject: [PATCH 14/70] fixed gui bugs --- koboldcpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 0db372a320ff0..8b5e914951072 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -931,7 +931,7 @@ def togglemiro(a,b,c): togglemiro(1,1,1) # context size - makeslider(tokens_tab, "Context Size:",contextsize_text, context_var, 0, 4, 20, set=2) + makeslider(tokens_tab, "Context Size:",contextsize_text, context_var, 0, len(contextsize_text)-1, 20, set=2) customrope_scale_entry, customrope_scale_label = makelabelentry(tokens_tab, "RoPE Scale:", customrope_scale) @@ -1040,7 +1040,7 @@ def export_vars(): args.blasbatchsize = int(blasbatchsize_values[int(blas_size_var.get())]) args.forceversion = 0 if version_var.get()=="" else int(version_var.get()) - args.mirostat = [int(mirostat_var.get()), float(mirostat_tau.get()), float(mirostat_eta.get())] if usemirostat.get()==1 else None + args.usemirostat = [int(mirostat_var.get()), float(mirostat_tau.get()), float(mirostat_eta.get())] if usemirostat.get()==1 else None args.contextsize = int(contextsize_text[context_var.get()]) if customrope_var.get()==1: From 9589d52079e0fcef9fa177aaefc4e2430a652c8b Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 29 Jul 2023 17:33:15 +0800 Subject: [PATCH 15/70] added help link --- koboldcpp.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 8b5e914951072..62ac8e7a7c5e3 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -1165,12 +1165,20 @@ def load_config(): import_vars(dict) pass - ctk.CTkButton(tabs , text = "Launch", fg_color="#2f8d3c", command = guilaunch, width=80, height = 35 ).grid(row=1,column=1, stick="se", padx= 25, pady=5) + def display_help(): + try: + import webbrowser as wb + wb.open("https://github.com/LostRuins/koboldcpp/wiki") + except: + print("Cannot launch help browser.") + + ctk.CTkButton(tabs , text = "Launch", fg_color="#2f8d3c", hover_color="#2faa3c", command = guilaunch, width=80, height = 35 ).grid(row=1,column=1, stick="se", padx= 25, pady=5) - ctk.CTkButton(tabs , text = "Save", fg_color="#084a66", command = save_config, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 5, pady=5) - ctk.CTkButton(tabs , text = "Load", fg_color="#084a66", command = load_config, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 70, pady=5) + ctk.CTkButton(tabs , text = "Save", fg_color="#084a66", hover_color="#085a88", command = save_config, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 5, pady=5) + ctk.CTkButton(tabs , text = "Load", fg_color="#084a66", hover_color="#085a88", command = load_config, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 70, pady=5) + ctk.CTkButton(tabs , text = "Help", fg_color="#992222", hover_color="#bb3333", command = display_help, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 135, pady=5) - ctk.CTkButton(tabs , text = "Old GUI", fg_color="#084a66", command = switch_old_gui, width=100, height = 35 ).grid(row=1,column=0, stick="sw", padx= 5, pady=5) + ctk.CTkButton(tabs , text = "Old GUI", fg_color="#084a66", hover_color="#085a88", command = switch_old_gui, width=100, height = 35 ).grid(row=1,column=0, stick="sw", padx= 5, pady=5) # runs main loop until closed or launch clicked root.mainloop() From 9baf9ef304f330009d5a93b7390280a0fd27c9a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 29 Jul 2023 23:04:10 +0200 Subject: [PATCH 16/70] CUDA: faster multi GPU synchronization (#2448) --- ggml-cuda.cu | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d31fc79c10961..511f48c0a31a6 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3529,13 +3529,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm if (split) { // src0 = weight matrix is saved as a transposed matrix for better memory layout. // dst is NOT transposed. - // The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU. + // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU. // Instead they need to be copied to the correct slice in ne0 = dst row index. // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. - for (int64_t j = 0; j < ne1; ++j) { - float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3); - CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main)); - } + float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float), + i01_diff*sizeof(float), ne1, kind, cudaStream_main)); } else { float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main)); From 11f3ca06b8c66b0427aab0a472479da22553b472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 29 Jul 2023 23:04:44 +0200 Subject: [PATCH 17/70] CUDA: Quantized matrix matrix multiplication (#2160) * mmq implementation for non k-quants * q6_K * q2_K * q3_k * q4_K * vdr * q5_K * faster q8_1 loading * loop unrolling * add __restrict__ * q2_K sc_high * GGML_CUDA_MMQ_Y * Updated Makefile * Update Makefile * DMMV_F16 -> F16 * Updated README, CMakeLists * Fix CMakeLists.txt * Fix CMakeLists.txt * Fix multi GPU out-of-bounds --- CMakeLists.txt | 8 +- Makefile | 15 +- README.md | 6 +- ggml-cuda.cu | 1586 ++++++++++++++++++++++++++++++++++++++---------- 4 files changed, 1293 insertions(+), 322 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c43e65e746320..6e1abeaa1205a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,7 +67,9 @@ endif() option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_BLAS "llama: use BLAS" OFF) set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") -option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) +option(LLAMA_CUBLAS "llama: use CUDA" OFF) +option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) +set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels") option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") @@ -251,6 +253,10 @@ if (LLAMA_CUBLAS) set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h) add_compile_definitions(GGML_USE_CUBLAS) + if (LLAMA_CUDA_CUBLAS) + add_compile_definitions(GGML_CUDA_CUBLAS) + endif() + add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y}) if (LLAMA_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV) endif() diff --git a/Makefile b/Makefile index 2035c525338a7..3d1fff8e55eee 100644 --- a/Makefile +++ b/Makefile @@ -194,7 +194,7 @@ ifdef LLAMA_CUBLAS CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib OBJS += ggml-cuda.o - NVCCFLAGS = --forward-unknown-to-host-compiler + NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math ifdef LLAMA_CUDA_NVCC NVCC = $(LLAMA_CUDA_NVCC) else @@ -220,14 +220,25 @@ else ifdef LLAMA_CUDA_DMMV_Y else NVCCFLAGS += -DGGML_CUDA_MMV_Y=1 endif # LLAMA_CUDA_MMV_Y +ifdef LLAMA_CUDA_F16 + NVCCFLAGS += -DGGML_CUDA_F16 +endif # LLAMA_CUDA_F16 ifdef LLAMA_CUDA_DMMV_F16 - NVCCFLAGS += -DGGML_CUDA_DMMV_F16 + NVCCFLAGS += -DGGML_CUDA_F16 endif # LLAMA_CUDA_DMMV_F16 ifdef LLAMA_CUDA_KQUANTS_ITER NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) else NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 endif +ifdef LLAMA_CUDA_MMQ_Y + NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y) +else + NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64 +endif # LLAMA_CUDA_MMQ_Y +ifdef LLAMA_CUDA_CUBLAS + NVCCFLAGS += -DGGML_CUDA_CUBLAS +endif # LLAMA_CUDA_CUBLAS ifdef LLAMA_CUDA_CCBIN NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) endif diff --git a/README.md b/README.md index 6a3268d129b55..42fc42b05ae56 100644 --- a/README.md +++ b/README.md @@ -402,10 +402,12 @@ Building the program with BLAS support may lead to some performance improvements | Option | Legal values | Default | Description | |-------------------------|------------------------|---------|-------------| + | LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). | + | LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. | | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | - | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. | + | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | + | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### CLBlast diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 511f48c0a31a6..0a43fb5dad0f2 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -52,13 +52,41 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); } while (0) #endif // CUDART_VERSION >= 11 -#ifdef GGML_CUDA_DMMV_F16 +#ifdef GGML_CUDA_F16 typedef half dfloat; // dequantize float typedef half2 dfloat2; #else typedef float dfloat; // dequantize float typedef float2 dfloat2; -#endif //GGML_CUDA_DMMV_F16 +#endif //GGML_CUDA_F16 + +static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) { + const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) { + const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) { + return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) { + return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream); @@ -87,8 +115,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 #define QR4_1 2 #define QI4_1 (QK4_1 / (4 * QR4_1)) typedef struct { - half d; // delta - half m; // min + half2 dm; // dm.x = delta, dm.y = min uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); @@ -107,8 +134,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5 #define QR5_1 2 #define QI5_1 (QK5_1 / (4 * QR5_1)) typedef struct { - half d; // delta - half m; // min + half2 dm; // dm.x = delta, dm.y = min uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; @@ -127,13 +153,19 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo #define QR8_1 1 #define QI8_1 (QK8_1 / (4 * QR8_1)) typedef struct { - half d; // delta - half s; // unquantized sum + half2 ds; // ds.x = delta, ds.y = sum int8_t qs[QK8_0]; // quants } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding"); -typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs); +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); +typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc); +typedef void (*load_tiles_cuda_t)( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row); +typedef float (*vec_dot_q_mul_mat_cuda_t)( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k); //================================= k-quants @@ -150,8 +182,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_ typedef struct { uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits uint8_t qs[QK_K/4]; // quants - half d; // super-block scale for quantized scales - half dmin; // super-block scale for quantized mins + half2 dm; // super-block scale for quantized scales/mins } block_q2_K; static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); @@ -180,8 +211,7 @@ typedef struct { static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding"); #else typedef struct { - half d; // super-block scale for quantized scales - half dmin; // super-block scale for quantized mins + half2 dm; // super-block scale for quantized scales/mins uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; @@ -200,11 +230,10 @@ typedef struct { static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); #else typedef struct { - half d; // super-block scale for quantized scales - half dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits + half2 dm; // super-block scale for quantized scales/mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits } block_q5_K; static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); #endif @@ -233,6 +262,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define CUDA_QUANTIZE_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 +#ifndef GGML_CUDA_MMQ_Y +#define GGML_CUDA_MMQ_Y 64 +#endif // GGML_CUDA_MMQ_Y + // dmmv = dequantize_mul_mat_vec #ifndef GGML_CUDA_DMMV_X #define GGML_CUDA_DMMV_X 32 @@ -367,33 +400,33 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in v.x = vui & 0xF; v.y = vui >> 4; -#ifdef GGML_CUDA_DMMV_F16 +#ifdef GGML_CUDA_F16 v = __hsub2(v, {8.0f, 8.0f}); v = __hmul2(v, {d, d}); #else v.x = (v.x - 8.0f) * d; v.y = (v.y - 8.0f) * d; -#endif // GGML_CUDA_DMMV_F16 +#endif // GGML_CUDA_F16 } static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q4_1 * x = (const block_q4_1 *) vx; - const dfloat d = x[ib].d; - const dfloat m = x[ib].m; + const dfloat d = x[ib].dm.x; + const dfloat m = x[ib].dm.y; const int vui = x[ib].qs[iqs]; v.x = vui & 0xF; v.y = vui >> 4; -#ifdef GGML_CUDA_DMMV_F16 +#ifdef GGML_CUDA_F16 v = __hmul2(v, {d, d}); v = __hadd2(v, {m, m}); #else v.x = (v.x * d) + m; v.y = (v.y * d) + m; -#endif // GGML_CUDA_DMMV_F16 +#endif // GGML_CUDA_F16 } static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ @@ -410,20 +443,20 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); v.y = ((x[ib].qs[iqs] >> 4) | xh_1); -#ifdef GGML_CUDA_DMMV_F16 +#ifdef GGML_CUDA_F16 v = __hsub2(v, {16.0f, 16.0f}); v = __hmul2(v, {d, d}); #else v.x = (v.x - 16.0f) * d; v.y = (v.y - 16.0f) * d; -#endif // GGML_CUDA_DMMV_F16 +#endif // GGML_CUDA_F16 } static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q5_1 * x = (const block_q5_1 *) vx; - const dfloat d = x[ib].d; - const dfloat m = x[ib].m; + const dfloat d = x[ib].dm.x; + const dfloat m = x[ib].dm.y; uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); @@ -434,13 +467,13 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); v.y = ((x[ib].qs[iqs] >> 4) | xh_1); -#ifdef GGML_CUDA_DMMV_F16 +#ifdef GGML_CUDA_F16 v = __hmul2(v, {d, d}); v = __hadd2(v, {m, m}); #else v.x = (v.x * d) + m; v.y = (v.y * d) + m; -#endif // GGML_CUDA_DMMV_F16 +#endif // GGML_CUDA_F16 } static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ @@ -451,12 +484,12 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in v.x = x[ib].qs[iqs + 0]; v.y = x[ib].qs[iqs + 1]; -#ifdef GGML_CUDA_DMMV_F16 +#ifdef GGML_CUDA_F16 v = __hmul2(v, {d, d}); #else v.x *= d; v.y *= d; -#endif // GGML_CUDA_DMMV_F16 +#endif // GGML_CUDA_F16 } //================================== k-quants @@ -475,8 +508,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const uint8_t q = x[i].qs[32*n + l]; float * y = yy + i*QK_K + 128*n; - float dall = x[i].d; - float dmin = x[i].dmin; + float dall = x[i].dm.x; + float dmin = x[i].dm.y; y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); @@ -486,8 +519,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const int il = tid%16; // 0...15 const uint8_t q = x[i].qs[il] >> (2*is); float * y = yy + i*QK_K + 16*is + il; - float dall = x[i].d; - float dmin = x[i].dmin; + float dall = x[i].dm.x; + float dmin = x[i].dm.y; y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); #endif @@ -573,8 +606,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float float * y = yy + i*QK_K + 64*il + n*ir; - const float dall = x[i].d; - const float dmin = x[i].dmin; + const float dall = x[i].dm.x; + const float dmin = x[i].dm.y; const uint8_t * q = x[i].qs + 32*il + n*ir; @@ -612,8 +645,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float float * y = yy + i*QK_K + 64*il + 2*ir; - const float dall = x[i].d; - const float dmin = x[i].dmin; + const float dall = x[i].dm.x; + const float dmin = x[i].dm.y; const uint8_t * ql = x[i].qs + 32*il + 2*ir; const uint8_t * qh = x[i].qh + 2*ir; @@ -725,8 +758,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * y = yy + i * QK_K + y_offset; const uint8_t * q = x[i].qs + q_offset; - const float dall = x[i].d; - const float dmin = x[i].dmin; + const float dall = x[i].dm.x; + const float dmin = x[i].dm.y; const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); aux[0] = a[0] & 0x0f0f0f0f; @@ -768,9 +801,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, uaux[0] = s[0] & 0x0f0f0f0f; uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; - const half2 * dh = (const half2 *)&x[i].d; - - const float2 dall = __half22float2(dh[0]); + const float2 dall = __half22float2(x[i].dm); float sum1 = 0, sum2 = 0; for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { @@ -948,8 +979,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].d; - const float dmin = x[i].dmin; + const float dall = x[i].dm.x; + const float dmin = x[i].dm.y; const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; @@ -1081,8 +1112,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].d; - const float dmin = x[i].dmin; + const float dall = x[i].dm.x; + const float dmin = x[i].dm.y; const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; @@ -1270,19 +1301,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, v.y = x[ib + iqs + 1]; } -static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; +static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) { + const int ix = blockDim.x*blockIdx.x + threadIdx.x; - if (i >= k) { + if (ix >= kx_padded) { return; } + const int iy = blockDim.y*blockIdx.y + threadIdx.y; + + const int i_padded = iy*kx_padded + ix; + block_q8_1 * y = (block_q8_1 *) vy; - const int ib = i / QK8_1; // block index - const int iqs = i % QK8_1; // quant index + const int ib = i_padded / QK8_1; // block index + const int iqs = i_padded % QK8_1; // quant index - const float xi = i < ndata ? x[i] : 0.0f; + const float xi = ix < kx ? x[iy*kx + ix] : 0.0f; float amax = fabsf(xi); float sum = xi; @@ -1301,8 +1336,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest return; } - y[ib].d = d; - y[ib].s = sum; + y[ib].ds.x = d; + y[ib].ds.y = sum; } template @@ -1326,45 +1361,79 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __ y[iybs + iqs + y_offset] = v.y; } -static __device__ __forceinline__ float vec_dot_q4_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; +// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called - int vi; - memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int)); - const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]); - const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]); +#define VDR_q4_0_q8_1 1 - const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d); +static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( + const int & vi, const int & ui0, const int & ui1, const half & d4, const half2 & ds8) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics // subtract 8 from each quantized value - const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808); - const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808); + const int vi0 = (vi >> 0) & 0x0F0F0F0F; + const int vi1 = (vi >> 4) & 0x0F0F0F0F; // SIMD dot product of quantized values int sumi = __dp4a(vi0, ui0, 0); sumi = __dp4a(vi1, ui1, sumi); - return sumi*d; + return __half2float(d4) * (sumi * __half2float(ds8.x) - (8/QI4_0) * __half2float(ds8.y)); #else return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -static __device__ __forceinline__ float vec_dot_q4_1_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; +static __device__ __forceinline__ float vec_dot_q4_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; + + const int vi = get_int_from_uint8(bq4_0->qs, iqs); + const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs); + const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI4_0); - const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]); - const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]); - const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]); + return vec_dot_q4_0_q8_1_impl(vi, ui0, ui1, bq4_0->d, bq8_1->ds); +} + +static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; + __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0)]; + + *x_ql = tile_x_qs; + *x_dm = tile_x_d; +} - const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d); - const float m = bq4_1->m; - const float s = bq8_1->s; +static __device__ __forceinline__ void load_tiles_q4_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + const int kbx = k / QI4_0; + const int kqsx = k % QI4_0; + + const block_q4_0 * bx = ((block_q4_0 *) vx) + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bx->qs, kqsx); + x_dm[i * (WARP_SIZE / QI4_0) + kbx].x = bx->d; +} + +static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + + return vec_dot_q4_0_q8_1_impl( + x_ql[i * (WARP_SIZE + 1) + k], y_qs[j * (2*WARP_SIZE) + kyqs], y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], + x_dm[i * (WARP_SIZE/QI4_0) + k/QI4_0].x, y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); +} + +#define VDR_q4_1_q8_1 1 + +static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( + const int & vi, const int & ui0, const int & ui1, const half2 & dm4, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const int vi0 = (vi >> 0) & 0x0F0F0F0F; const int vi1 = (vi >> 4) & 0x0F0F0F0F; @@ -1372,184 +1441,421 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( int sumi = __dp4a(vi0, ui0, 0); sumi = __dp4a(vi1, ui1, sumi); - return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block +#ifdef GGML_CUDA_F16 + const half2 tmp = __hmul2(dm4, ds8); + const float d4d8 = __half2float(tmp.x); + const float m4s8 = __half2float(tmp.y); +#else + const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x); + const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y); +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/QR4_1 to compensate for multiple threads adding it + return sumi * d4d8 + m4s8 / (QI8_1 / QR4_1); #else return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -static __device__ __forceinline__ float vec_dot_q5_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q4_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; + + const int vi = get_int_from_uint8_aligned(bq4_1->qs, iqs); + const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs); + const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI4_1); + + return vec_dot_q4_1_q8_1_impl(vi, ui0, ui1, bq4_1->dm, bq8_1->ds); +} + +static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1)]; + + *x_ql = tile_x_qs; + *x_dm = tile_x_dm; +} + +static __device__ __forceinline__ void load_tiles_q4_1( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + + const int kbx = k / QI4_1; + const int kqsx = k % QI4_1; + + const block_q4_1 * bx = ((block_q4_1 *) vx) + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bx->qs, kqsx); + x_dm[i * (WARP_SIZE / QI4_1) + kbx] = bx->dm; +} + +static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + + return vec_dot_q4_1_q8_1_impl( + x_ql[i * (WARP_SIZE + 1) + k], y_qs[j * (2*WARP_SIZE) + kyqs], y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], + x_dm[i * (WARP_SIZE/QI4_1) + k/QI4_1], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); +} + +#define VDR_q5_0_q8_1 1 + +static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( + const int & qs, const int & qh, const int & ui0, const int & ui1, const half & d5, const half2 & ds8) { + #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (qh << 4) & 0x00000010; // 0 -> 4 + vi0 |= (qh << 11) & 0x00001000; // 1 -> 12 + vi0 |= (qh << 18) & 0x00100000; // 2 -> 20 + vi0 |= (qh << 25) & 0x10000000; // 3 -> 28 + int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values + + int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (qh << 2) & 0x00100000; // 18 -> 20 + vi1 |= (qh << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values + + return __half2float(d5) * (sumi*__half2float(ds8.x) - (16/QI5_0) * __half2float(ds8.y)); +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q5_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; - int qs; - memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int)); - const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2); - const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2); - const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]); - const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]); - - const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d); - - int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits - vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5 - vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13 - vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21 - vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29 - vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values + const int qs = get_int_from_uint8(bq5_0->qs, iqs); + const int qh = get_int_from_uint8(bq5_0->qh, 0) >> (4 * iqs); + const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs); + const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI5_0); + + return vec_dot_q5_0_q8_1_impl(qs, qh, ui0, ui1, bq5_0->d, bq8_1->ds); +} + +static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; + __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0)]; + __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0)]; + + *x_ql = tile_x_ql; + *x_qh = tile_x_qh; + *x_dm = tile_x_d; +} + +static __device__ __forceinline__ void load_tiles_q5_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + + const int kbx = k / QI5_0; + const int kqsx = k % QI5_0; + + const block_q5_0 * bx = ((block_q5_0 *) vx) + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bx->qs, kqsx); + x_qh[i * (WARP_SIZE / QI5_0) + kbx] = get_int_from_uint8(bx->qh, 0); + x_dm[i * (WARP_SIZE / QI5_0) + kbx].x = bx->d; +} + +static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_0) + k/QI5_0; + + return vec_dot_q5_0_q8_1_impl( + x_ql[i * (WARP_SIZE + 1) + k], x_qh[index_bx] >> (4 * (k % QI5_0)), y_qs[j * (2*WARP_SIZE) + kyqs], + y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], x_dm[index_bx].x, y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); +} + +#define VDR_q5_1_q8_1 1 + +static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( + const int & qs, const int & qh, const int & ui0, const int & ui1, const half2 & dm5, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits + vi0 |= (qh << 4) & 0x00000010; // 0 -> 4 + vi0 |= (qh << 11) & 0x00001000; // 1 -> 12 + vi0 |= (qh << 18) & 0x00100000; // 2 -> 20 + vi0 |= (qh << 25) & 0x10000000; // 3 -> 28 int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values - int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits - vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5 - vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13 - vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21 - vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29 - vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values + int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits + vi1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (qh << 2) & 0x00100000; // 18 -> 20 + vi1 |= (qh << 9) & 0x10000000; // 19 -> 28 sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values - return sumi*d; +#ifdef GGML_CUDA_F16 + const half2 tmp = __hmul2(dm5, ds8); + const float d5d8 = __half2float(tmp.x); + const float m5s8 = __half2float(tmp.y); +#else + const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x); + const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y); +#endif // GGML_CUDA_F16 + + return sumi*d5d8 + m5s8/QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block + #else return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } static __device__ __forceinline__ float vec_dot_q5_1_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; - const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]); - const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2); - const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2); - const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]); - const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]); - - const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d); - const float m = bq5_1->m; - const float s = bq8_1->s; - - int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits - vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5 - vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13 - vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21 - vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29 - int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values + const int qs = get_int_from_uint8_aligned(bq5_1->qs, iqs); + const int qh = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * iqs); + const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs); + const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI5_1); - int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits - vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5 - vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13 - vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21 - vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29 - sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values + return vec_dot_q5_1_q8_1_impl(qs, qh, ui0, ui1, bq5_1->dm, bq8_1->ds); +} + +static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; + __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1)]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1)]; + + *x_ql = tile_x_ql; + *x_qh = tile_x_qh; + *x_dm = tile_x_dm; +} + +static __device__ __forceinline__ void load_tiles_q5_1( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + + const int kbx = k / QI5_1; + const int kqsx = k % QI5_1; + + const block_q5_1 * bx = ((block_q5_1 *) vx) + i*blocks_per_row + kbx; - return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bx->qs, kqsx); + x_qh[i * (WARP_SIZE / QI5_1) + kbx] = get_int_from_uint8(bx->qh, 0); + x_dm[i * (WARP_SIZE / QI5_1) + kbx] = bx->dm; +} + +static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_0) + k/QI5_0; + + return vec_dot_q5_1_q8_1_impl( + x_ql[i * (WARP_SIZE + 1) + k], x_qh[index_bx] >> (4 * (k % QI5_1)), y_qs[j * (2*WARP_SIZE) + kyqs], + y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); +} + +#define VDR_q8_0_q8_1 1 + +static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( + const int & vi, const int & ui, const half & d8_0, const half2 & ds8_1) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + // SIMD dot product of quantized values + const int sumi = __dp4a(vi, ui, 0); + + return sumi * __half2float(d8_0) * __half2float(ds8_1.x); #else return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } static __device__ __forceinline__ float vec_dot_q8_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; - int vi; - memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int)); - const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]); + const int vi = get_int_from_int8(bq8_0->qs, iqs); + const int ui = get_int_from_int8_aligned(bq8_1->qs, iqs); - const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d); + return vec_dot_q8_0_q8_1_impl(vi, ui, bq8_0->d, bq8_1->ds); +} - // SIMD dot product of quantized values - int sumi = __dp4a(vi, ui, 0); +static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; + __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0)]; + + *x_ql = tile_x_qs; + *x_dm = tile_x_d; +} + +static __device__ __forceinline__ void load_tiles_q8_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + + const int kbx = k / QI8_0; + const int kqsx = k % QI8_0; + + const block_q8_0 * bx = ((block_q8_0 *) vx) + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bx->qs, kqsx); + x_dm[i * (WARP_SIZE / QI8_0) + kbx].x = bx->d; +} + +static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + return vec_dot_q8_0_q8_1_impl( + x_ql[i * (WARP_SIZE + 1) + k], y_qs[j*WARP_SIZE + k], + x_dm[i * (WARP_SIZE/QI8_0) + k/QI8_0].x, y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]); +} + +#define VDR_q2_K_q8_1 1 + +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl( + const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const half2 & dm, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + for (int i = 0; i < QR2_K; ++i) { + const int sc = scales[2*i]; + + const int vi = (v >> (2*i)) & 0x03030303; + + sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product + + int sc_high = sc >> 4; + sc_high |= sc_high << 8; + sc_high |= sc_high << 16; + sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values + } + + const float2 dmf = __half22float2(dm); - return sumi*d; + return dmf.x*sumf_d - dmf.y*sumf_m; #else return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } static __device__ __forceinline__ float vec_dot_q2_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q2_K * bq2_K = (const block_q2_K *) vbq; const int bq8_offset = QR2_K * (iqs / QI8_1); const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); - float sumf_d = 0.0f; - float sumf_m = 0.0f; + const uint8_t * scales = bq2_K->scales + scale_offset; - const float d = bq2_K->d; - const float dmin = bq2_K->dmin; + const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); + int u[QR2_K]; + float d8[QR2_K]; - const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]); + for (int i = 0; i < QR2_K; ++ i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + i].ds.x; + } - for (int i = 0; i < QR2_K; ++i) { - const int sc = bq2_K->scales[scale_offset + 2*i]; + return vec_dot_q2_K_q8_1_impl(v, u, scales, bq2_K->dm, d8); +} - const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - const float d8i = bq8i->d; +static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - const int vi = (v >> (2*i)) & 0x03030303; - const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE / QI2_K)]; + __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE / 4)]; - sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product - sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values - } + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} - return d*sumf_d - dmin*sumf_m; -#else - return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +static __device__ __forceinline__ void load_tiles_q2_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + + const int kbx = k / QI2_K; + const int kqsx = k % QI2_K; + + const block_q2_K * bx = ((block_q2_K *) vx) + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bx->qs, kqsx); + x_dm[i * (WARP_SIZE / QI2_K) + kbx] = bx->dm; + x_sc[i * (WARP_SIZE / 4) + k/4] = get_int_from_uint8_aligned(bx->scales, kqsx / 4); } -static __device__ __forceinline__ float vec_dot_q3_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_q3_K * bq3_K = (const block_q3_K *) vbq; + __builtin_assume(i < GGML_CUDA_MMQ_Y); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k < WARP_SIZE); - const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); - const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + const int kbx = k / QI2_K; + const int kqsx = k % QI2_K; - float sumf = 0.0f; + const int bq8_offset = QR2_K * (kqsx / QI8_1); + const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2); - const float d = bq3_K->d; + const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4))) + kbx*16 + scale_offset; + + int u[QR2_K]; + float d8[QR2_K]; + + for (int l = 0; l < QR2_K; ++ l) { + const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1; + u[l] = y_qs[y_qs_index]; + d8[l] = y_ds[y_qs_index / QI8_1].x; + } - int vl; - memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int)); + return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + kbx], d8); +} + +#define VDR_q3_K_q8_1 1 + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl( + const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const int & scale_offset, const float & d, const float * __restrict__ d8) { - int vh; - memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int)); - vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted - vh >>= bq8_offset; +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; for (int i = 0; i < QR3_K; ++i) { const int isc = scale_offset + 2*i; const int isc_low = isc % (QK_K/32); const int sc_shift_low = 4 * (isc / (QK_K/32)); - const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF; + const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; const int isc_high = isc % (QK_K/64); const int sc_shift_high = 2 * (isc / (QK_K/64)); - const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; const int sc = (sc_low | sc_high) - 32; - const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); - const float d8i = bq8i->d; - const int vil = (vl >> (2*i)) & 0x03030303; const int vih = ((vh >> i) << 2) & 0x04040404; const int vi = __vsubss4(vil, vih); - sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product } return d*sumf; @@ -1558,31 +1864,136 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -static __device__ __forceinline__ float vec_dot_q4_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q3_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_q4_K * bq4_K = (const block_q4_K *) vbq; + const block_q3_K * bq3_K = (const block_q3_K *) vbq; + + const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + const float d = bq3_K->d; + + const int vl = get_int_from_uint8(bq3_K->qs, iqs); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; + + int u[QR3_K]; + float d8[QR3_K]; + + for (int i = 0; i < QR3_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + i].ds.x; + } + + return vec_dot_q3_K_q8_1_impl(vl, vh, u, bq3_K->scales, scale_offset, d, d8); +} + +static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE / QI2_K)]; + __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE / 2)]; + __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE / 4)]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_qh = tile_x_qh; + *x_sc = tile_x_sc; +} + +static __device__ __forceinline__ void load_tiles_q3_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + + const int kbx = k / QI3_K; + const int kqsx = k % QI3_K; + + const block_q3_K * bx = ((block_q3_K *) vx) + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bx->qs, kqsx); + x_dm[i * (WARP_SIZE / QI3_K) + kbx].x = bx->d; + x_qh[i * (WARP_SIZE / 2) + k/2] = get_int_from_uint8(bx->hmask, kqsx / 2); + x_sc[i * (WARP_SIZE / 4) + k/4] = get_int_from_uint8(bx->scales, kqsx / 4); +} + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kbx = k / QI3_K; + const int kqsx = k % QI3_K; + + const int bq8_offset = QR3_K * (kqsx / (QI3_K/2)); + const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2); + + const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4))) + kbx*16; + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + const int vh = ~x_qh[i * (WARP_SIZE/2) + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset; + + int u[QR3_K]; + float d8[QR3_K]; + + for (int l = 0; l < QR3_K; ++ l) { + const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1; + u[l] = y_qs[y_qs_index]; + d8[l] = y_ds[y_qs_index / QI8_1].x; + } + + return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset, x_dm[i * (WARP_SIZE/QI3_K) + kbx].x, d8); +} + +#define VDR_q4_K_q8_1 2 + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; float sumf_m = 0.0f; + for (int i = 0; i < QR4_K; ++i) { + const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; + const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; + + const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values + } + + return __half2float(dm4.x)*sumf_d - __half2float(dm4.y)*sumf_m; + +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + #ifndef GGML_QKK_64 + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + int v[2]; + int u[2*QR4_K]; + float d8[QR4_K]; // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6 const int bq8_offset = QR4_K * (iqs / (QI8_1/2)); - const float d = bq4_K->d; - const float dmin = bq4_K->dmin; - // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4)); - const int v1 = q4[0]; - const int v2 = q4[4]; + v[0] = q4[0]; + v[1] = q4[4]; const uint16_t * scales = (const uint16_t *)bq4_K->scales; uint16_t aux[2]; @@ -1598,60 +2009,161 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( const uint8_t * m = sc + 2; for (int i = 0; i < QR4_K; ++i) { - const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - const float d8i = bq8i->d; - const int * q8 = (const int *)bq8i->qs + (iqs%4); - const int ui1 = q8[0]; - const int ui2 = q8[4]; - - const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F; - const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F; - - const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product - const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); + d8[i] = bq8i->ds.x; - sumf_d += d8i * (dot1 * sc[i]); - sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values + const int * q8 = (const int *)bq8i->qs + (iqs%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; } - return d*sumf_d - dmin*sumf_m; + return vec_dot_q4_K_q8_1_impl(v, u, sc, m, bq4_K->dm, d8); #else +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + uint16_t aux16[2]; const uint8_t * s = (const uint8_t *)aux16; - const uint16_t * a = (const uint16_t *)bq4_K->scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; + const uint16_t * a = (const uint16_t *)bq4_K->scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + const float dall = bq4_K->d[0]; + const float dmin = bq4_K->d[1]; + + const float d8_1 = bq8_1[0].ds.x; + const float d8_2 = bq8_1[1].ds.x; + + const int ui1 = *((const int *)bq8_1[0].qs + iqs); + const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4); + const int ui3 = *((const int *)bq8_1[1].qs + iqs); + const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4); + + const int * q4 = (const int *)bq4_K->qs + iqs; + const int v1 = q4[0]; + const int v2 = q4[4]; + + const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0)); + const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); + const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); + const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0)); + + sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); + sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); + + return dall * sumf_d - dmin * sumf_m; + +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif +} + +static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K)]; + __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (3*WARP_SIZE/32)]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +static __device__ __forceinline__ void load_tiles_q4_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + + const int kbx = k / QI4_K; + const int kqsx = k % QI4_K; + + const block_q4_K * bx = ((block_q4_K *) vx) + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bx->qs, kqsx); + x_dm[i * (WARP_SIZE / QI6_K) + kbx] = bx->dm; + x_sc[i * (3*WARP_SIZE/32) + k % (3*WARP_SIZE/32)] = get_int_from_uint8_aligned(bx->scales, k % (3*WARP_SIZE/32)); +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + __builtin_assume(i < GGML_CUDA_MMQ_Y); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI6_K; // == 0 if QK_K == 256 + const int kqsx = k % QI6_K; // == k if QK_K == 256 + + int v[2]; + int u[2*QR4_K]; + float d8[QR4_K]; + + // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6 + const int bq8_offset = QR4_K * (kqsx / (QI8_1/2)); + + v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 0]; + v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 4]; + + const uint16_t * scales = (const uint16_t *) &x_sc[i * (3*WARP_SIZE/32) + kbx * (3*WARP_SIZE/32)]; + uint16_t aux[2]; + const int l = bq8_offset/2; + if (l < 2) { + aux[0] = scales[l+0] & 0x3f3f; + aux[1] = scales[l+2] & 0x3f3f; + } else { + aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2); + aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + + for (int l = 0; l < QR4_K; ++l) { + const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + kqsx % (QI8_1/2); + u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)]; + u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)]; + d8[l] = y_ds[kqsy / QI8_1].x; + } + + return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + kbx], d8); +} + +#define VDR_q5_K_q8_1 2 - const float dall = bq4_K->d[0]; - const float dmin = bq4_K->d[1]; +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl( + const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { - const float d8_1 = bq8_1[0].d; - const float d8_2 = bq8_1[1].d; +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; - const int ui1 = *((const int *)bq8_1[0].qs + iqs); - const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4); - const int ui3 = *((const int *)bq8_1[1].qs + iqs); - const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4); + for (int i = 0; i < QR5_K; ++i) { + const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; + const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; - const int * q4 = (const int *)bq4_K->qs + iqs; - const int v1 = q4[0]; - const int v2 = q4[4]; + const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; + const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; - const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0)); - const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); - const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); - const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0)); + const int v0i = vl0i | vh0i; + const int v1i = vl1i | vh1i; - sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); - sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); + const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u - return dall * sumf_d - dmin * sumf_m; + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); -#endif + } + + return __half2float(dm5.x)*sumf_d - __half2float(dm5.y)*sumf_m; #else return 0.0f; // only to satisfy the compiler @@ -1659,28 +2171,25 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( } static __device__ __forceinline__ float vec_dot_q5_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +#ifndef GGML_QKK_64 const block_q5_K * bq5_K = (const block_q5_K *) vbq; -#ifndef GGML_QKK_64 + int vl[2]; + int vh[2]; + int u[2*QR5_K]; + float d8[QR5_K]; const int bq8_offset = QR5_K * (iqs / (QI8_1/2)); const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4)); const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4)); - float sumf_d = 0.0f; - float sumf_m = 0.0f; - - const float d = bq5_K->d; - const float dmin = bq5_K->dmin; + vl[0] = ql[0]; + vl[1] = ql[4]; - const int vl1 = ql[0]; - const int vl2 = ql[4]; - - const int vh1 = qh[0] >> bq8_offset; - const int vh2 = qh[4] >> bq8_offset; + vh[0] = qh[0] >> bq8_offset; + vh[1] = qh[4] >> bq8_offset; const uint16_t * scales = (const uint16_t *)bq5_K->scales; uint16_t aux[2]; @@ -1696,40 +2205,27 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( const uint8_t * m = sc + 2; for (int i = 0; i < QR5_K; ++i) { - const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - const float d8i = bq8i->d; - const int * q8 = (const int *)bq8i->qs + (iqs%4); - const int ui1 = q8[0]; - const int ui2 = q8[4]; - - const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F; - const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F; - - const int vih1 = ((vh1 >> i) << 4) & 0x10101010; - const int vih2 = ((vh2 >> i) << 4) & 0x10101010; - - const int vi1 = vil1 | vih1; - const int vi2 = vil2 | vih2; - - const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product - const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); - - sumf_d += d8i * (dot1 * sc[i]); - sumf_m += d8i * (dot2 * m[i]); + d8[i] = bq8i->ds.x; + const int * q8 = (const int *)bq8i->qs + (iqs%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; } - return d*sumf_d - dmin*sumf_m; + return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8); #else +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + const int8_t * s = bq5_K->scales; const float d = bq5_K->d; - const float d8_1 = bq8_1[0].d; - const float d8_2 = bq8_1[1].d; + const float d8_1 = bq8_1[0].ds.x; + const float d8_2 = bq8_1[1].ds.x; const int ui1 = *((const int *)bq8_1[0].qs + iqs); const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4); @@ -1755,56 +2251,304 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( return d * sumf_d; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + #endif +} + +static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K)]; + __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4)]; + __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (3*WARP_SIZE/32)]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_qh = tile_x_qh; + *x_sc = tile_x_sc; +} + +static __device__ __forceinline__ void load_tiles_q5_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + + const int kbx = k / QI5_K; + const int kqsx = k % QI5_K; + + const block_q5_K * bx = ((block_q5_K *) vx) + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bx->qs, kqsx); + x_dm[i * (WARP_SIZE / QI6_K) + kbx] = bx->dm; + x_qh[i * (WARP_SIZE / 4) + k/4] = get_int_from_uint8_aligned(bx->qh, kqsx/4); + x_sc[i * (3*WARP_SIZE/32) + k % (3*WARP_SIZE/32)] = get_int_from_uint8_aligned(bx->scales, k % (3*WARP_SIZE/32)); +} + +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + __builtin_assume(i < 2*WARP_SIZE); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI6_K; // == 0 if QK_K == 256 + const int kqsx = k % QI6_K; // == k if QK_K == 256 + + int vl[2]; + int vh[2]; + int u[2*QR4_K]; + float d8[QR4_K]; + const int bq8_offset = QR5_K * (kqsx / (QI8_1/2)); + + vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 0]; + vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 4]; + + vh[0] = x_qh[i * (WARP_SIZE/4) + kqsx % 4 + 0] >> bq8_offset; + vh[1] = x_qh[i * (WARP_SIZE/4) + kqsx % 4 + 4] >> bq8_offset; + + const uint16_t * scales = (const uint16_t *) &x_sc[i * (3*WARP_SIZE/32) + kbx * (3*WARP_SIZE/32)]; + uint16_t aux[2]; + const int l = bq8_offset/2; + if (l < 2) { + aux[0] = scales[l+0] & 0x3f3f; + aux[1] = scales[l+2] & 0x3f3f; + } else { + aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2); + aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + + for (int l = 0; l < QR5_K; ++l) { + const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + kqsx % (QI8_1/2); + u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)]; + u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)]; + d8[l] = y_ds[kqsy / QI8_1].x; + } + + return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + kbx], d8); +} + +#define VDR_q6_K_q8_1 1 + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl( + const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales, + const float & d, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + + for (int i = 0; i < QR6_K; ++i) { + const int sc = scales[4*i]; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (4*i)) << 4) & 0x30303030; + + const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d*sumf; #else return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } static __device__ __forceinline__ float vec_dot_q6_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q6_K * bq6_K = (const block_q6_K *) vbq; const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); - float sumf = 0.0f; - - const float d = bq6_K->d; + const int vl = get_int_from_uint8(bq6_K->ql, iqs); + const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; - int vl; - memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int)); + const int8_t * scales = bq6_K->scales + scale_offset; - int vh; - memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int)); + int u[QR6_K]; + float d8[QR6_K]; for (int i = 0; i < QR6_K; ++i) { - const int sc = bq6_K->scales[scale_offset + 4*i]; + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + 2*i].ds.x; + } - const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i; - const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]); - const float d8i = bq8i->d; + return vec_dot_q6_K_q8_1_impl(vl, vh, u, scales, bq6_K->d, d8); +} - const int vil = (vl >> (4*i)) & 0x0F0F0F0F; +static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030; + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K)]; + __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2)]; + __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8)]; - const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_qh = tile_x_qh; + *x_sc = tile_x_sc; +} + +static __device__ __forceinline__ void load_tiles_q6_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + + const int kbx = k / QI6_K; + const int kqsx = k % QI6_K; + + const block_q6_K * bx = ((block_q6_K *) vx) + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bx->ql, kqsx); + x_dm[i * (WARP_SIZE / QI6_K) + kbx].x = bx->d; + x_qh[i * (WARP_SIZE / 2) + k/2] = get_int_from_uint8(bx->qh, kqsx/2); + x_sc[i * (WARP_SIZE / 8) + k/8] = get_int_from_int8(bx->scales, kqsx/8); +} + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + __builtin_assume(i < GGML_CUDA_MMQ_Y); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI6_K; // == 0 if QK_K == 256 + const int kqsx = k % QI6_K; // == k if QK_K == 256 + + const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4); + const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8); + const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)); + + const int vh = x_qh[i * (WARP_SIZE/2) + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift; + + const int x_sc_offset = i * (WARP_SIZE/8) + kbx * (QI6_K/8); + const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset; + + int u[QR6_K]; + float d8[QR6_K]; - sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + for (int l = 0; l < QR6_K; ++l) { + const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1; + u[l] = y_qs[kqsy]; + d8[l] = y_ds[kqsy / QI8_1].x; } - return d*sumf; -#else - return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, x_dm[i * (WARP_SIZE/QI6_K) + kbx].x, d8); +} + +template +static __global__ void mul_mat_q( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_col_y = nrows_y / QK8_1; + const int blocks_per_warp = WARP_SIZE / qi; + + const int & ncols_dst = ncols_y; + + const int tid_x = threadIdx.x; + const int tid_y = threadIdx.y; + + const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y; + const int & row_x_0 = row_dst_0; + const int row_dst = row_dst_0 + tid_x; + + const int col_dst_0 = blockIdx.y*WARP_SIZE; + const int & col_y_0 = col_dst_0; + + int * tile_x_ql = nullptr; + half2 * tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + + allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + + const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1; + + __shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)]; + __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col]; + + float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f}; + + for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { + + for (int i = 0; i < GGML_CUDA_MMQ_Y; i += 8) { + load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, + i + tid_y, tid_x, blocks_per_row_x); + } + + for (int ir = 0; ir < qr; ++ir) { + const int kqs = ir*WARP_SIZE + tid_x; + const int kby = kqs / QI8_1; + + for (int i = 0; i < WARP_SIZE; i += 8) { + const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses + + const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby]; + + tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1); + } + } + + for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) { + const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE; + const int kby = tid_x % blocks_per_tile_y_col; + const int col_y_eff = min(col_y_0 + ids, ncols_y-1); + tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby] = y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds; + } + + __syncthreads(); + +#if __CUDA_ARCH__ >= 700 // TODO: actually test this with compute capability 7.X cards +#pragma unroll +#endif // __CUDA_ARCH__ >= 700 + for (int k = 0; k < WARP_SIZE/vdr; ++k) { +#pragma unroll + for (int j = 0; j < WARP_SIZE; j += 8) { +#pragma unroll + for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) { + sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, + tid_x + i, tid_y + j, k); + } + } + } + + __syncthreads(); + } + + + if (row_dst >= nrows_dst) { + return; + } + + for (int j = 0; j < WARP_SIZE; j += 8) { + const int col_dst = col_dst_0 + j + tid_y; + + if (col_dst >= ncols_dst) { + return; + } + + for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) { + dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8]; + } + } } -template +template static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { const int row = blockIdx.y*blockDim.y + threadIdx.y; @@ -1813,7 +2557,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = WARP_SIZE / qi; + const int blocks_per_warp = vdr * WARP_SIZE / qi; // partial sum for each thread float tmp = 0.0f; @@ -1822,11 +2566,11 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * const block_q8_1 * y = (const block_q8_1 *) vy; for (int i = 0; i < blocks_per_row; i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index + const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index - const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx + const int iby = (i + threadIdx.x / (qi/vdr)) * qk/QK8_1; // y block index that aligns with ibx - const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int + const int iqs = threadIdx.x % (qi/vdr); // x block quant index when casting the quants to int tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs); } @@ -1859,11 +2603,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons const int y_offset = qr == 1 ? 1 : qk/2; // partial sum for each thread -#ifdef GGML_CUDA_DMMV_F16 +#ifdef GGML_CUDA_F16 half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics #else float tmp = 0.0f; -#endif // GGML_CUDA_DMMV_F16 +#endif // GGML_CUDA_F16 for (int i = 0; i < ncols; i += iter_stride) { const int col = i + vals_per_iter*tid; @@ -1883,7 +2627,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons // matrix multiplication // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 -#ifdef GGML_CUDA_DMMV_F16 +#ifdef GGML_CUDA_F16 tmp += __hmul2(v, { y[iybs + iqs + j/qr + 0], y[iybs + iqs + j/qr + y_offset] @@ -1891,7 +2635,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons #else tmp += v.x * y[iybs + iqs + j/qr + 0]; tmp += v.y * y[iybs + iqs + j/qr + y_offset]; -#endif // GGML_CUDA_DMMV_F16 +#endif // GGML_CUDA_F16 } } @@ -1902,11 +2646,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons } if (tid == 0) { -#ifdef GGML_CUDA_DMMV_F16 +#ifdef GGML_CUDA_F16 dst[row] = tmp.x + tmp.y; #else dst[row] = tmp; -#endif // GGML_CUDA_DMMV_F16 +#endif // GGML_CUDA_F16 } } @@ -2203,9 +2947,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con rms_norm_f32<<>>(x, dst, ncols, eps); } -static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - quantize_q8_1<<>>(x, vy, ndata, k); +static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) { + const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + const dim3 num_blocks(block_num_x, ky, 1); + const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1); + quantize_q8_1<<>>(x, vy, kx, kx_padded); } static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { @@ -2366,7 +3112,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -2375,7 +3121,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -2384,7 +3130,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -2393,7 +3139,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -2402,7 +3148,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -2411,7 +3157,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -2420,7 +3166,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -2429,10 +3175,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per - // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales - // is better amortized. - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -2441,10 +3184,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per - // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales - // is better amortized. - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -2453,7 +3193,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -2500,6 +3240,126 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { } } +static void ggml_mul_mat_q4_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; + const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + mul_mat_q + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +} + +static void ggml_mul_mat_q4_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; + const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + mul_mat_q + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +} + +static void ggml_mul_mat_q5_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; + const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + mul_mat_q + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +} + +static void ggml_mul_mat_q5_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; + const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + mul_mat_q + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +} + +static void ggml_mul_mat_q8_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; + const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + mul_mat_q + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +} + +static void ggml_mul_mat_q2_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; + const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + mul_mat_q + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +} + +static void ggml_mul_mat_q3_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; + const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + mul_mat_q + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +} + +static void ggml_mul_mat_q4_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; + const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + mul_mat_q + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +} + +static void ggml_mul_mat_q5_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; + const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + mul_mat_q + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +} + +static void ggml_mul_mat_q6_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; + const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + mul_mat_q + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +} + static void ggml_mul_mat_p021_f16_f32_cuda( const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y, cudaStream_t stream) { @@ -2965,6 +3825,83 @@ inline void ggml_cuda_op_rms_norm( (void) i1; } +inline void ggml_cuda_op_mul_mat_q( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddq_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + const int64_t i01_diff = i01_high - i01_low; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into + const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff; + + const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ? + ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; + size_t as; + void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as); + quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main); + + switch (src0->type) { + case GGML_TYPE_Q4_0: + ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q4_1: + ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q5_0: + ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q5_1: + ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q8_0: + ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q2_K: + ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q3_K: + ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q4_K: + ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q5_K: + ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q6_K: + ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + default: + GGML_ASSERT(false); + break; + } + + ggml_cuda_pool_free(src1_q8_1, as); + + (void) src1; + (void) dst; + (void) src0_ddf_i; + (void) i02; + (void) i1; +} + inline void ggml_cuda_op_mul_mat_vec( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, @@ -3006,7 +3943,7 @@ inline void ggml_cuda_op_mul_mat_vec( ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; size_t as; void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as); - quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main); + quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main); switch (src0->type) { case GGML_TYPE_Q4_0: @@ -3047,7 +3984,7 @@ inline void ggml_cuda_op_mul_mat_vec( ggml_cuda_pool_free(src1_q8_1, as); } else { // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics -#ifdef GGML_CUDA_DMMV_F16 +#ifdef GGML_CUDA_F16 size_t ash; dfloat * src1_dfloat = nullptr; // dfloat == half @@ -3063,7 +4000,7 @@ inline void ggml_cuda_op_mul_mat_vec( } #else dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion -#endif // GGML_CUDA_DMMV_F16 +#endif // GGML_CUDA_F16 switch (src0->type) { case GGML_TYPE_Q4_0: @@ -3104,11 +4041,11 @@ inline void ggml_cuda_op_mul_mat_vec( break; } -#ifdef GGML_CUDA_DMMV_F16 +#ifdef GGML_CUDA_F16 if (src1_convert_f16) { ggml_cuda_pool_free(src1_dfloat, ash); } -#endif // GGML_CUDA_DMMV_F16 +#endif // GGML_CUDA_F16 } (void) src1; @@ -3363,7 +4300,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm int64_t row_low, row_high; if (split) { row_low = id == 0 ? 0 : nrows0*g_tensor_split[id]; + row_low -= row_low % GGML_CUDA_MMQ_Y; + row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1]; + row_high -= row_high % GGML_CUDA_MMQ_Y; } else { row_low = 0; row_high = nrows0*i02_divisor; @@ -3717,7 +4657,16 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false); } else { - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false); +#ifdef GGML_CUDA_CUBLAS + const bool use_mul_mat_q = false; +#else + const bool use_mul_mat_q = ggml_is_quantized(src0->type); +#endif // GGML_CUDA_CUBLAS + if (use_mul_mat_q) { + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false); + } else { + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false); + } } } else { GGML_ASSERT(false); @@ -3827,7 +4776,10 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { row_high = nrows; } else if (backend == GGML_BACKEND_GPU_SPLIT) { row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; + row_low -= row_low % GGML_CUDA_MMQ_Y; + row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1]; + row_high -= row_high % GGML_CUDA_MMQ_Y; } else { GGML_ASSERT(false); } From 23825abee1717d5b44d935aac3d16eaf5198403a Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 30 Jul 2023 14:30:46 +0800 Subject: [PATCH 18/70] fix wrong key --- koboldcpp.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 62ac8e7a7c5e3..15507e7e0b3c3 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -1114,11 +1114,11 @@ def import_vars(dict): if "forceversion" in dict and dict["forceversion"]: version_var.set(str(dict["forceversion"])) - if "mirostat" in dict and dict["mirostat"] and len(dict["mirostat"])>1: - usemirostat.set(0 if str(dict["mirostat"][0])=="0" else 1) - mirostat_var.set(str(dict["mirostat"][0])) - mirostat_tau.set(str(dict["mirostat"][1])) - mirostat_eta.set(str(dict["mirostat"][2])) + if "usemirostat" in dict and dict["usemirostat"] and len(dict["usemirostat"])>1: + usemirostat.set(0 if str(dict["usemirostat"][0])=="0" else 1) + mirostat_var.set(str(dict["usemirostat"][0])) + mirostat_tau.set(str(dict["usemirostat"][1])) + mirostat_eta.set(str(dict["usemirostat"][2])) if "model_param" in dict and dict["model_param"]: model_var.set(dict["model_param"]) From 45456fa6ca21c1f89595e9d0846d517e5a54d6de Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 30 Jul 2023 16:47:33 +0800 Subject: [PATCH 19/70] switch noavx2 to not use openblas, as it has incompatible instructions --- .gitignore | 4 ++-- Makefile | 22 +++++++++++----------- koboldcpp.py | 30 +++++++++++++++--------------- llama-util.h | 2 +- make_old_pyinstaller.bat | 2 +- make_old_pyinstaller_cuda.bat | 2 +- make_pyinstaller.bat | 2 +- make_pyinstaller.sh | 2 +- make_pyinstaller_hybrid_henk.bat | 2 +- otherarch/llama_v2-util.h | 2 +- 10 files changed, 35 insertions(+), 35 deletions(-) diff --git a/.gitignore b/.gitignore index 3fcc055cddae5..043cbe7fbfebb 100644 --- a/.gitignore +++ b/.gitignore @@ -81,12 +81,12 @@ tests/test-tokenizer-0 koboldcpp.so koboldcpp_failsafe.so koboldcpp_openblas.so -koboldcpp_openblas_noavx2.so +koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp.dll koboldcpp_failsafe.dll koboldcpp_openblas.dll -koboldcpp_openblas_noavx2.dll +koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll cublas64_11.dll diff --git a/Makefile b/Makefile index f2386457f942a..f13811a56e370 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast koboldcpp_cublas +default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_cublas tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt dev: koboldcpp_openblas dev2: koboldcpp_clblast @@ -213,7 +213,7 @@ endif DEFAULT_BUILD = FAILSAFE_BUILD = OPENBLAS_BUILD = -OPENBLAS_NOAVX2_BUILD = +NOAVX2_BUILD = CLBLAST_BUILD = CUBLAS_BUILD = @@ -221,7 +221,7 @@ ifeq ($(OS),Windows_NT) DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS) FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS) OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS) - OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS) + NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS) CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS) ifdef LLAMA_CUBLAS @@ -233,7 +233,7 @@ else FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) ifdef LLAMA_OPENBLAS OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) - OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) + NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) endif ifdef LLAMA_CLBLAST ifeq ($(UNAME_S),Darwin) @@ -283,8 +283,8 @@ ggml_openblas.o: ggml.c ggml.h $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ ggml_failsafe.o: ggml.c ggml.h $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ -ggml_openblas_noavx2.o: ggml.c ggml.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ +ggml_noavx2.o: ggml.c ggml.h + $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ ggml_clblast.o: ggml.c ggml.h $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_cublas.o: ggml.c ggml.h @@ -305,8 +305,8 @@ ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ ggml_v2_failsafe.o: otherarch/ggml_v2.c otherarch/ggml_v2.h $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ -ggml_v2_openblas_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ +ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h + $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h @@ -348,7 +348,7 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER) $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@ clean: - rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so + rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so main: examples/main/main.cpp build-info.h ggml.o k_quants.o llama.o common.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @@ -363,8 +363,8 @@ koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common $(OPENBLAS_BUILD) koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o $(OBJS) $(FAILSAFE_BUILD) -koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v2_openblas_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o $(OBJS) - $(OPENBLAS_NOAVX2_BUILD) +koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o $(OBJS) + $(NOAVX2_BUILD) koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o $(OBJS) $(CLBLAST_BUILD) koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o $(CUBLAS_OBJS) $(OBJS) diff --git a/koboldcpp.py b/koboldcpp.py index 15507e7e0b3c3..588c944dd1565 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -89,29 +89,30 @@ def pick_existant_file(ntoption,nonntoption): lib_default = pick_existant_file("koboldcpp.dll","koboldcpp.so") lib_failsafe = pick_existant_file("koboldcpp_failsafe.dll","koboldcpp_failsafe.so") lib_openblas = pick_existant_file("koboldcpp_openblas.dll","koboldcpp_openblas.so") -lib_openblas_noavx2 = pick_existant_file("koboldcpp_openblas_noavx2.dll","koboldcpp_openblas_noavx2.so") +lib_noavx2 = pick_existant_file("koboldcpp_noavx2.dll","koboldcpp_noavx2.so") lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so") lib_cublas = pick_existant_file("koboldcpp_cublas.dll","koboldcpp_cublas.so") def init_library(): global handle - global lib_default,lib_failsafe,lib_openblas,lib_openblas_noavx2,lib_clblast,lib_cublas + global lib_default,lib_failsafe,lib_openblas,lib_noavx2,lib_clblast,lib_cublas libname = "" - use_blas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir. + use_openblas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir. use_clblast = False #uses CLBlast instead use_cublas = False #uses cublas instead - use_noavx2 = False #uses openblas with no avx2 instructions + use_noavx2 = False #uses no avx2 instructions + use_failsafe = False #uses no intrinsics, failsafe mode if args.noavx2: use_noavx2 = True - if not file_exists(lib_openblas_noavx2) or (os.name=='nt' and not file_exists("libopenblas.dll")): - print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.") - elif args.noblas: + if not file_exists(lib_noavx2): + print("Warning: NoAVX2 library file not found. Failsafe library will be used.") + elif (args.noblas and args.nommap): + use_failsafe = True print("!!! Attempting to use FAILSAFE MODE !!!") else: - use_blas = True - print("Attempting to use non-avx2 compatibility library with OpenBLAS. A compatible libopenblas will be required.") + print("Attempting to use non-avx2 compatibility library.") elif args.useclblast: if not file_exists(lib_clblast) or (os.name=='nt' and not file_exists("clblast.dll")): print("Warning: CLBlast library file not found. Non-BLAS library will be used.") @@ -130,22 +131,22 @@ def init_library(): elif args.noblas: print("Attempting to library without OpenBLAS.") else: - use_blas = True + use_openblas = True print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas will be required.") if sys.platform=="darwin": print("Mac OSX note: Some people have found Accelerate actually faster than OpenBLAS. To compare, run Koboldcpp with --noblas instead.") if use_noavx2: - if use_blas: - libname = lib_openblas_noavx2 - else: + if use_failsafe: libname = lib_failsafe + else: + libname = lib_noavx2 else: if use_clblast: libname = lib_clblast elif use_cublas: libname = lib_cublas - elif use_blas: + elif use_openblas: libname = lib_openblas else: libname = lib_default @@ -1326,7 +1327,6 @@ def onDropdownChange(event): args.noavx2 = True args.noblas = True args.nommap = True - print("[Failsafe Mode : mmap is disabled.]") if selblaschoice==blasbatchopts[0]: args.blasbatchsize = -1 diff --git a/llama-util.h b/llama-util.h index e1e7fd748c072..7b80ddfc27af0 100644 --- a/llama-util.h +++ b/llama-util.h @@ -247,7 +247,7 @@ struct llama_mmap { #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8 #else - printf("\nPrefetchVirtualMemory skipped in failsafe mode."); + printf("\nPrefetchVirtualMemory skipped in compatibility mode.\n"); #endif } diff --git a/make_old_pyinstaller.bat b/make_old_pyinstaller.bat index e5d1113f8f4e5..c72a2ce10fa42 100644 --- a/make_old_pyinstaller.bat +++ b/make_old_pyinstaller.bat @@ -1,4 +1,4 @@ echo This file is only for my own usage, please do not use it. I am lazy. set PATH=d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python;d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python\\Scripts;%PATH% -PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp_nocuda.exe" \ No newline at end of file +PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp_nocuda.exe" \ No newline at end of file diff --git a/make_old_pyinstaller_cuda.bat b/make_old_pyinstaller_cuda.bat index 004b88bbfe276..dfe2845716149 100644 --- a/make_old_pyinstaller_cuda.bat +++ b/make_old_pyinstaller_cuda.bat @@ -1,4 +1,4 @@ echo This file is only for my own usage, please do not use it. I am lazy. set PATH=d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python;d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python\\Scripts;%PATH% -PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./nikogreen.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cublas64_11.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cudart64_110.dll;." --add-data "./msvcp140.dll;." --add-data "./vcruntime140.dll;." --add-data "./vcruntime140_1.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe" \ No newline at end of file +PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./nikogreen.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cublas64_11.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cudart64_110.dll;." --add-data "./msvcp140.dll;." --add-data "./vcruntime140.dll;." --add-data "./vcruntime140_1.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe" \ No newline at end of file diff --git a/make_pyinstaller.bat b/make_pyinstaller.bat index 12be110f88258..3cf867b28c3fe 100644 --- a/make_pyinstaller.bat +++ b/make_pyinstaller.bat @@ -1 +1 @@ -PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe" \ No newline at end of file +PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe" \ No newline at end of file diff --git a/make_pyinstaller.sh b/make_pyinstaller.sh index 155a1cd82bcda..270850663ee79 100644 --- a/make_pyinstaller.sh +++ b/make_pyinstaller.sh @@ -5,7 +5,7 @@ pyinstaller --noconfirm --onefile --clean --console --collect-all customtkinter --add-data "./koboldcpp.so:." \ --add-data "./koboldcpp_openblas.so:." \ --add-data "./koboldcpp_failsafe.so:." \ ---add-data "./koboldcpp_openblas_noavx2.so:." \ +--add-data "./koboldcpp_noavx2.so:." \ --add-data "./koboldcpp_clblast.so:." \ --add-data "./rwkv_vocab.embd:." \ --add-data "./rwkv_world_vocab.embd:." \ diff --git a/make_pyinstaller_hybrid_henk.bat b/make_pyinstaller_hybrid_henk.bat index be25d2ebf6deb..a99b1569dcdd7 100644 --- a/make_pyinstaller_hybrid_henk.bat +++ b/make_pyinstaller_hybrid_henk.bat @@ -2,4 +2,4 @@ cd /d "%~dp0" copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cudart64_110.dll" .\ /Y copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cublasLt64_11.dll" .\ /Y copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cublas64_11.dll" .\ /Y -PyInstaller --noconfirm --onefile --collect-all customtkinter --clean --console --icon ".\niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cudart64_110.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cublas64_11.dll;." --add-data "./rwkv_vocab.embd;." --add-data "C:/Windows/System32/msvcp140.dll;." --add-data "C:/Windows/System32/vcruntime140_1.dll;." "./koboldcpp.py" -n "koboldcpp.exe" \ No newline at end of file +PyInstaller --noconfirm --onefile --collect-all customtkinter --clean --console --icon ".\niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cudart64_110.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cublas64_11.dll;." --add-data "./rwkv_vocab.embd;." --add-data "C:/Windows/System32/msvcp140.dll;." --add-data "C:/Windows/System32/vcruntime140_1.dll;." "./koboldcpp.py" -n "koboldcpp.exe" \ No newline at end of file diff --git a/otherarch/llama_v2-util.h b/otherarch/llama_v2-util.h index 00aedf8e64ecd..63942a007301b 100644 --- a/otherarch/llama_v2-util.h +++ b/otherarch/llama_v2-util.h @@ -225,7 +225,7 @@ struct llama_v2_mmap { #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8 #else - printf("\nPrefetchVirtualMemory skipped in failsafe mode."); + printf("\nPrefetchVirtualMemory skipped in compatibility mode.\n"); #endif } From c8af65760f7341519b9cec205c85223949b0bb87 Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun, 30 Jul 2023 04:50:55 -0500 Subject: [PATCH 20/70] Hide unavailable backends & Add tooltip over backend count (#352) * Hide unavailable backends & Add tooltip over backend count Hides unavailable backends from the user and if the program is launched without any backends made, it shows an error message to them stating no backends were found and to make them using the 'make' command Add tooltip when hovering over backend count label hovering over the new label that shows the backend count will explain what the numbers are, and show the users which backends are not available or built * add some code comments * hide "missing" if all are built move tooltip functions to helper functions section. hides the string "Missing: ..." from showing if all backends are available " if len(runopts)==6 else + " * small typo fix * remove wrongly added leftover device choosing code * fix labels * move tooltip to function --------- Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com> --- koboldcpp.py | 119 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 85 insertions(+), 34 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 588c944dd1565..71b248b3b0df6 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -662,7 +662,6 @@ def show_new_gui(): return import customtkinter as ctk - nextstate = 0 #0=exit, 1=launch, 2=oldgui windowwidth = 520 windowheight = 500 @@ -685,13 +684,22 @@ def show_new_gui(): tabcontentframe.grid_propagate(False) tabcontent = {} - + lib_option_pairs = [ + (lib_openblas, "Use OpenBLAS"), + (lib_clblast, "Use CLBlast"), + (lib_cublas, "Use CuBLAS"), + (lib_default, "Use No BLAS"), + (lib_noavx2, "Use NoAVX2 Mode (Old CPU)"), + (lib_failsafe, "Failsafe Mode (Old CPU)")] + openblas_option, clblast_option, cublas_option, default_option, openblas_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs) # slider data blasbatchsize_values = ["-1", "32", "64", "128", "256", "512", "1024"] blasbatchsize_text = ["Don't Batch BLAS","32","64","128","256","512","1024"] contextsize_text = ["512", "1024", "2048", "3072", "4096", "6144", "8192"] - runopts = ["Use OpenBLAS","Use CLBlast", "Use CuBLAS", "Use No BLAS","Use OpenBLAS (Old CPU, noavx2)","Failsafe Mode (Old CPU, noavx)"] - + runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib) or os.name == 'nt' and file_exists(opt + ".dll")] + antirunopts = [opt.replace("Use ", "") for lib, opt in lib_option_pairs if not file_exists(lib) or os.name == 'nt' and not file_exists(opt + ".dll")] + if not any(runopts): + show_gui_warning("No Backend Available") def tabbuttonaction(name): for t in tabcontent: if name == t: @@ -757,6 +765,32 @@ def getfilename(var, text): button.grid(row=row+1, column=1, stick="nw") return + def show_tooltip(event, tooltip_text=None): + if hasattr(show_tooltip, "_tooltip"): + tooltip = show_tooltip._tooltip + else: + tooltip = ctk.CTkToplevel(root) + tooltip.configure(fg_color="#ffffe0") + tooltip.withdraw() + tooltip.overrideredirect(True) + tooltip_label = ctk.CTkLabel(tooltip, text=tooltip_text, text_color="#000000", fg_color="#ffffe0") + tooltip_label.pack(expand=True, padx=2, pady=1) + show_tooltip._tooltip = tooltip + x, y = root.winfo_pointerxy() + tooltip.wm_geometry(f"+{x + 10}+{y + 10}") + tooltip.deiconify() + def hide_tooltip(event): + if hasattr(show_tooltip, "_tooltip"): + tooltip = show_tooltip._tooltip + tooltip.withdraw() + def setup_backend_tooltip(parent): + num_backends_built = makelabel(parent, str(len(runopts)) + "/6", 5, 2) + num_backends_built.grid(row=1, column=2, padx=0, pady=0) + num_backends_built.configure(text_color="#00ff00") + # Bind the backend count label with the tooltip function + num_backends_built.bind("", lambda event: show_tooltip(event, f"This is the number of backends you have built and available." + (f"\nMissing: {', '.join(antirunopts)}" if len(runopts) != 6 else ""))) + num_backends_built.bind("", hide_tooltip) + # Vars - should be in scope to be used by multiple widgets gpulayers_var = ctk.StringVar(value="0") threads_var = ctk.StringVar(value=str(default_threads)) @@ -857,7 +891,10 @@ def changerunmode(a,b,c): runoptbox = ctk.CTkComboBox(quick_tab, values=runopts, width=180,variable=runopts_var, state="readonly") runoptbox.grid(row=1, column=1,padx=8, stick="nw") - runoptbox.set("Use OpenBLAS") + runoptbox.set(runopts[0]) # Set to first available option + + # Tell user how many backends are available + setup_backend_tooltip(quick_tab) # threads makelabelentry(quick_tab, "Threads:" , threads_var, 8, 50) @@ -869,7 +906,6 @@ def changerunmode(a,b,c): quick_boxes = {"Launch Browser": launchbrowser , "High Priority" : highpriority, "Streaming Mode":stream, "Use SmartContext":smartcontext, "Unban Tokens":unbantokens, "Disable MMAP":disablemmap,} for idx, name, in enumerate(quick_boxes): makecheckbox(quick_tab, name, quick_boxes[name], int(idx/2) +20, idx%2) - # context size makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=2) @@ -890,9 +926,13 @@ def changerunmode(a,b,c): makelabel(hardware_tab, "Presets:", 1) runoptbox = ctk.CTkComboBox(hardware_tab, values=runopts, width=180,variable=runopts_var, state="readonly") runoptbox.grid(row=1, column=1,padx=8, stick="nw") - runoptbox.set("Use OpenBLAS") + runoptbox.set(runopts[0]) # Set to first available option runopts_var.trace('w', changerunmode) changerunmode(1,1,1) + + # Tell user how many backends are available + setup_backend_tooltip(hardware_tab) + # threads makelabelentry(hardware_tab, "Threads:" , threads_var, 8, 50) @@ -1018,20 +1058,20 @@ def export_vars(): gpuchoiceidx = 0 if gpu_choice_var.get()!="All": gpuchoiceidx = int(gpu_choice_var.get())-1 - if runopts_var.get() == runopts[1]: + if runopts_var.get() == "Use CLBlast": args.useclblast = [[0,0], [1,0], [0,1]][gpuchoiceidx] - if runopts_var.get() == runopts[2]: + if runopts_var.get() == "Use CuBLAS": if gpu_choice_var.get()=="All": args.usecublas = ["lowvram"] if lowvram_var.get() == 1 else ["normal"] else: args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)] if gpulayers_var.get(): args.gpulayers = int(gpulayers_var.get()) - if runopts_var.get()==runopts[3]: + if runopts_var.get()=="Use No BLAS": args.noblas = True - if runopts_var.get()==runopts[4]: + if runopts_var.get()=="Use NoAVX2 Mode (Old CPU)": args.noavx2 = True - if runopts_var.get()==runopts[5]: + if runopts_var.get()=="Failsafe Mode (Old CPU)": args.noavx2 = True args.noblas = True args.nommap = True @@ -1070,38 +1110,42 @@ def import_vars(dict): stream.set(1 if "stream" in dict and dict["stream"] else 0) smartcontext.set(1 if "smartcontext" in dict and dict["smartcontext"] else 0) unbantokens.set(1 if "unbantokens" in dict and dict["unbantokens"] else 0) - runopts_var.set(runopts[0]) if "useclblast" in dict and dict["useclblast"]: - runopts_var.set(runopts[1]) - gpu_choice_var.set(str(["0 0", "1 0", "0 1"].index(str(dict["useclblast"][0]) + " " + str(dict["useclblast"][1])) + 1)) + if clblast_option is not None: + runopts_var.set(clblast_option) + gpu_choice_var.set(str(["0 0", "1 0", "0 1"].index(str(dict["useclblast"][0]) + " " + str(dict["useclblast"][1])) + 1)) elif "usecublas" in dict and dict["usecublas"]: - runopts_var.set(runopts[2]) - if len(dict["usecublas"])==1: - lowvram_var.set(1 if dict["usecublas"][0]=="lowvram" else 0) - else: - lowvram_var.set(1 if "lowvram" in dict["usecublas"] else 0) - gpu_choice_var.set("1") - for g in range(3): - if str(g) in dict["usecublas"]: - gpu_choice_var.set(str(g+1)) - break + if cublas_option is not None: + runopts_var.set(cublas_option) + if len(dict["usecublas"])==1: + lowvram_var.set(1 if dict["usecublas"][0]=="lowvram" else 0) + else: + lowvram_var.set(1 if "lowvram" in dict["usecublas"] else 0) + gpu_choice_var.set("1") + for g in range(3): + if str(g) in dict["usecublas"]: + gpu_choice_var.set(str(g+1)) + break if "gpulayers" in dict and dict["gpulayers"]: gpulayers_var.set(dict["gpulayers"]) if "noavx2" in dict and "noblas" in dict and dict["noblas"] and dict["noavx2"]: - runopts_var.set(runopts[5]) + if failsafe_option is not None: + runopts_var.set(failsafe_option) elif "noavx2" in dict and dict["noavx2"]: - runopts_var.set(runopts[4]) + if openblas_noavx2_option is not None: + runopts_var.set(openblas_noavx2_option) elif "noblas" in dict and dict["noblas"]: - runopts_var.set(runopts[3]) + if default_option is not None: + runopts_var.set(default_option) + elif openblas_option is not None: + runopts_var.set(openblas_option) if "blasthreads" in dict and dict["blasthreads"]: blas_threads_var.set(str(dict["blasthreads"])) else: blas_threads_var.set("") - if "contextsize" in dict and dict["contextsize"]: context_var.set(contextsize_text.index(str(dict["contextsize"]))) - if "ropeconfig" in dict and dict["ropeconfig"] and len(dict["ropeconfig"])>1: if dict["ropeconfig"][0]>0: customrope_var.set(1) @@ -1199,13 +1243,20 @@ def display_help(): time.sleep(2) sys.exit(2) -def show_gui_warning(): +def show_gui_warning(issue=None): from tkinter import messagebox import tkinter as tk root = tk.Tk() root.attributes("-alpha", 0) - messagebox.showerror(title="New GUI failed, using Old GUI", message="The new GUI failed to load.\n\nTo use new GUI, please install the customtkinter python module.") - root.destroy() + if issue == "No Backend Available": + messagebox.showerror(title="No Backends Available!", message="KoboldCPP couldn't locate any backends to use.\n\nTo use the program, please run the 'make' command from the directory.") + root.destroy() + print("No Backend Available (i.e Default, OpenBLAS, CLBlast, CuBLAS). To use the program, please run the 'make' command from the directory.") + time.sleep(2) + sys.exit(2) + else: + messagebox.showerror(title="New GUI failed, using Old GUI", message="The new GUI failed to load.\n\nTo use new GUI, please install the customtkinter python module.") + root.destroy() def show_old_gui(): import tkinter as tk @@ -1236,7 +1287,7 @@ def guilaunch(): blaschoice = tk.StringVar() blaschoice.set("BLAS = 512") - runopts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use CuBLAS GPU","Use No BLAS","Use OpenBLAS (Old CPU, noavx2)","Failsafe Mode (Old CPU, noavx)"] + runopts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use CuBLAS GPU","Use No BLAS","Use NoAVX2 Mode (Old CPU)","Failsafe Mode (Old CPU)"] runchoice = tk.StringVar() runchoice.set("Use OpenBLAS") From 90a37d63d5711762056824ef56656d217751c31d Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 30 Jul 2023 18:07:14 +0800 Subject: [PATCH 21/70] up ver, added warning for max context --- koboldcpp.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index 71b248b3b0df6..bbd1664684e0e 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -230,12 +230,17 @@ def load_model(model_filename): return ret def generate(prompt,max_length=20, max_context_length=512, temperature=0.8, top_k=120, top_a=0.0, top_p=0.85, typical_p=1.0, tfs=1.0, rep_pen=1.1, rep_pen_range=128, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], stream_sse=False): + global maxctx inputs = generation_inputs() outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs)) inputs.prompt = prompt.encode("UTF-8") if max_length >= max_context_length: max_length = max_context_length-1 inputs.max_context_length = max_context_length # this will resize the context buffer if changed + global showmaxctxwarning + if showmaxctxwarning and max_context_length > maxctx: + print(f"\n(Warning! Request max_context_length={max_context_length} exceeds allocated context size of {maxctx}. Consider launching with increased --contextsize to avoid errors. This message will only show once per session.)") + showmaxctxwarning = False inputs.max_length = max_length inputs.temperature = temperature inputs.top_k = top_k @@ -296,9 +301,10 @@ def utfprint(str): maxhordelen = 256 modelbusy = threading.Lock() defaultport = 5001 -KcppVersion = "1.37.1" +KcppVersion = "1.38" showdebug = True showsamplerwarning = True +showmaxctxwarning = True exitcounter = 0 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): From a113689571420fb4d6540f1a324d12965781356a Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 30 Jul 2023 15:58:01 +0200 Subject: [PATCH 22/70] ggml : add graph tensor allocator (#2411) * ggml : add graph tensor allocator * ggml : don't calculate data pointer of unallocated tensors when creating a view with an offset * ggml : refactor ggml_view_Nd into ggml_view_tensor_offset --- CMakeLists.txt | 2 + Makefile | 7 +- ggml-alloc.c | 541 +++++++++++++++++++++++++++++++++++++++++++++++++ ggml-alloc.h | 22 ++ ggml.c | 75 ++++--- ggml.h | 13 +- llama.cpp | 240 ++++++++++++++++------ 7 files changed, 812 insertions(+), 88 deletions(-) create mode 100644 ggml-alloc.c create mode 100644 ggml-alloc.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e1abeaa1205a..57678a302d097 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -503,6 +503,8 @@ endif() add_library(ggml OBJECT ggml.c ggml.h + ggml-alloc.c + ggml-alloc.h ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_SOURCES_METAL} diff --git a/Makefile b/Makefile index 3d1fff8e55eee..616c2d9b88be9 100644 --- a/Makefile +++ b/Makefile @@ -329,7 +329,12 @@ $(info ) ggml.o: ggml.c ggml.h ggml-cuda.h $(CC) $(CFLAGS) -c $< -o $@ -llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h +ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h + $(CC) $(CFLAGS) -c $< -o $@ + +OBJS += ggml-alloc.o + +llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h $(CXX) $(CXXFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h diff --git a/ggml-alloc.c b/ggml-alloc.c new file mode 100644 index 0000000000000..5e1be61ff6cef --- /dev/null +++ b/ggml-alloc.c @@ -0,0 +1,541 @@ +#include "ggml-alloc.h" +#include "ggml.h" +#include +#include +#include +#include +#include + +#define UNUSED(x) (void)(x) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +//#define GGML_ALLOCATOR_DEBUG + +//#define AT_PRINTF printf +#define AT_PRINTF(...) ((void)0) + +struct hash_node { + struct ggml_tensor * t; + int n_children; + int n_views; +}; + +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; +} + +static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) { + size_t h = hash(t); + + // linear probing + size_t i = h; + while (hash_table[i].t != NULL) { + if (hash_table[i].t == t) { + return &hash_table[i]; + } + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // hash table is full + GGML_ASSERT(false); + } + } + + hash_table[i].t = t; + return &hash_table[i]; +} + +// TODO: GGML_PAD ? +static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { + assert(alignment && !(alignment & (alignment - 1))); // power of 2 + size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; + return offset + align; +} + +struct free_block { + void * addr; + size_t size; +}; + +#define MAX_FREE_BLOCKS 128 + +struct ggml_allocr { + void * data; + size_t size; + size_t alignment; + int n_free_blocks; + struct free_block free_blocks[MAX_FREE_BLOCKS]; + struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE]; + size_t max_size; + bool measure; + +#ifdef GGML_ALLOCATOR_DEBUG + struct ggml_tensor * allocated_tensors[1024]; +#endif +}; + +#ifdef GGML_ALLOCATOR_DEBUG +static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i] == NULL) { + alloc->allocated_tensors[i] = tensor; + return; + } + } + GGML_ASSERT(!"out of allocated_tensors"); +} +static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i] == tensor || + (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { + alloc->allocated_tensors[i] = NULL; + return; + } + } + printf("tried to free tensor %s not found\n", tensor->name); + GGML_ASSERT(!"tensor not found"); +} +#endif + + +static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + return ggml_nbytes(tensor); + + UNUSED(alloc); +} + +void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + size_t size = ggml_allocator_get_alloc_size(alloc, tensor); + size = aligned_offset(NULL, size, alloc->alignment); + + AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); + + size_t max_avail = 0; + + // find the best fitting free block + int best_fit_block = -1; + size_t best_fit_size = SIZE_MAX; + for (int i = 0; i < alloc->n_free_blocks; i++) { + struct free_block * block = &alloc->free_blocks[i]; + max_avail = MAX(max_avail, block->size); + if (block->size >= size && block->size <= best_fit_size) { + best_fit_block = i; + best_fit_size = block->size; + } + } + + AT_PRINTF("block %d\n", best_fit_block); + + if (best_fit_block == -1) { + fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", + __func__, size, max_avail); + GGML_ASSERT(!"not enough space in the buffer"); + return; + } + struct free_block * block = &alloc->free_blocks[best_fit_block]; + void * addr = block->addr; + block->addr = (char*)block->addr + size; + block->size -= size; + if (block->size == 0) { + // remove block if empty + alloc->n_free_blocks--; + for (int j = best_fit_block; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + + tensor->data = addr; + +#ifdef GGML_ALLOCATOR_DEBUG + add_allocated_tensor(alloc, tensor); + size_t cur_max = (char*)addr - (char*)alloc->data + size; + if (cur_max > alloc->max_size) { + printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i]) { + printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0); + } + } + printf("\n"); + } +#endif + + alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size); +} + +// this is a very naive implementation, but for our case the number of free blocks should be very small +static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + void * ptr = tensor->data; + + if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) { + // the tensor was not allocated in this buffer + // this can happen because the graph allocator will try to free weights and other tensors from different buffers + // the easiest way to deal with this is just to ignore it + return; + } + + size_t size = ggml_allocator_get_alloc_size(alloc, tensor); + size = aligned_offset(NULL, size, alloc->alignment); + AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks); + +#ifdef GGML_ALLOCATOR_DEBUG + remove_allocated_tensor(alloc, tensor); +#endif + + // see if we can merge with an existing block + for (int i = 0; i < alloc->n_free_blocks; i++) { + struct free_block * block = &alloc->free_blocks[i]; + // check if ptr is at the end of the block + if ((char*)block->addr + block->size == ptr) { + block->size += size; + // check if we can merge with the next block + if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) { + block->size += alloc->free_blocks[i+1].size; + alloc->n_free_blocks--; + for (int j = i+1; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + return; + } + // check if ptr is at the beginning of the block + if ((char*)ptr + size == block->addr) { + block->addr = ptr; + block->size += size; + // check if we can merge with the previous block + if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) { + alloc->free_blocks[i-1].size += block->size; + alloc->n_free_blocks--; + for (int j = i; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + return; + } + } + // otherwise, add a new block + GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks"); + // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster) + int insert_pos = 0; + while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) { + insert_pos++; + } + // shift all blocks from insert_pos onward to make room for the new block + for (int i = alloc->n_free_blocks; i > insert_pos; i--) { + alloc->free_blocks[i] = alloc->free_blocks[i-1]; + } + // insert the new block + alloc->free_blocks[insert_pos].addr = ptr; + alloc->free_blocks[insert_pos].size = size; + alloc->n_free_blocks++; +} + +void ggml_allocr_reset(struct ggml_allocr * alloc) { + alloc->n_free_blocks = 1; + size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); + alloc->free_blocks[0].addr = (char *)alloc->data + align_offset; + alloc->free_blocks[0].size = alloc->size - align_offset; +} + +struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) { + struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); + + *alloc = (struct ggml_allocr){ + /*.data = */ data, + /*.size = */ size, + /*.alignment = */ alignment, + /*.n_free_blocks = */ 0, + /*.free_blocks = */ {{0}}, + /*.hash_table = */ {{0}}, + /*.max_size = */ 0, + /*.measure = */ false, +#ifdef GGML_ALLOCATOR_DEBUG + /*.allocated_tensors = */ = {0}, +#endif + }; + + ggml_allocr_reset(alloc); + + return alloc; +} + +// address and size of the buffer when measuring +// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers +static void * const MEASURE_BASE_ADDR = (void *) 0x1000; +static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB + +struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { + struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); + + *alloc = (struct ggml_allocr){ + /*.data = */ MEASURE_BASE_ADDR, + /*.size = */ MEASURE_MAX_SIZE, + /*.alignment = */ alignment, + /*.n_free_blocks = */ 0, + /*.free_blocks = */ {{0}}, + /*.hash_table = */ {{0}}, + /*.max_size = */ 0, + /*.measure = */ true, +#ifdef GGML_ALLOCATOR_DEBUG + /*.allocated_tensors = */ = {0}, +#endif + }; + + ggml_allocr_reset(alloc); + + return alloc; +} + +void ggml_allocr_free(struct ggml_allocr * alloc) { + free(alloc); +} + +bool ggml_allocr_is_measure(struct ggml_allocr * alloc) { + return alloc->measure; +} + +//////////// compute graph allocator + +static bool ggml_is_view(struct ggml_tensor * t) { + return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || + t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY; +} + +static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + +static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) { + switch (t->op) { + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: + case GGML_OP_VIEW: + return t->src[0]; + case GGML_OP_CPY: + return t->src[1]; + default: + return NULL; + } +} + +static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { + struct ggml_tensor * parent = t; + do { + parent = get_view_parent(parent); + } while (ggml_is_view(parent)); + return parent; +} + +static bool ggml_op_can_inplace(enum ggml_op op) { + switch (op) { + case GGML_OP_SCALE: + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_ADD: + case GGML_OP_ADD1: + case GGML_OP_ACC: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_UNARY: + case GGML_OP_ROPE: + case GGML_OP_RMS_NORM: + case GGML_OP_SET: + case GGML_OP_SOFT_MAX: + case GGML_OP_CONT: + return true; + + default: + return false; + } +} + +static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) { + struct hash_node * ht = alloc->hash_table; + if (node->data == NULL) { + if (ggml_is_view(node)) { + size_t offset; + switch(node->op) { + case GGML_OP_VIEW: + memcpy(&offset, node->op_params, sizeof(size_t)); + node->data = (char *) node->src[0]->data + offset; + break; + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: + node->data = node->src[0]->data; + break; + case GGML_OP_CPY: + node->data = node->src[1]->data; + break; + default: + GGML_ASSERT(!"unknown view op"); + break; + } + } else { + // see if we can reuse a parent's buffer (inplace) + if (ggml_op_can_inplace(node->op)) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + struct ggml_tensor * parent = node->src[i]; + if (parent == NULL) { + break; + } + struct hash_node * p_hn = hash_get(ht, parent); + if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = get_view_source(parent); + struct hash_node * view_src_hn = hash_get(ht, view_src); + if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { + // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite + // the parent's data that it will need later (same layout requirement). the problem is that then + // we cannot free the tensor because the original address of the allocation is lost. + // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views + // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) + AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); + node->data = parent->data; + return; + } + } + else { + AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); + node->data = parent->data; + } + return; + } + } + } + ggml_allocr_alloc(alloc, node); + } + } +} + +static size_t ggml_allocator_alloc_graph_tensors_n( + struct ggml_allocr * alloc, + struct ggml_cgraph ** graphs, int n_graphs, + struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { + + // reset hash table + struct hash_node * ht = alloc->hash_table; + memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE); + + // count number of children and views + for (int g = 0; g < n_graphs; g++) { + struct ggml_cgraph * gf = graphs[g]; + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + + if (ggml_is_view(node)) { + struct ggml_tensor * view_src = get_view_source(node); + hash_get(ht, view_src)->n_views += 1; + } + + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + hash_get(ht, parent)->n_children += 1; + } + } + } + + // allocate tensors + for (int g = 0; g < n_graphs; g++) { + struct ggml_cgraph * gf = graphs[g]; + AT_PRINTF("####### graph %d/%d\n", g, n_graphs); + // graph inputs are allocated first to ensure that they are not overwritten by each other + if (inputs != NULL && inputs[g] != NULL) { + for (int i = 0; inputs[g][i] != NULL; i++) { + struct ggml_tensor * input = inputs[g][i]; + AT_PRINTF("input: %s\n", input->name); + allocate_node(alloc, input); + } + } + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + + // allocate parents (leafs) + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + allocate_node(alloc, parent); + } + + // allocate node + allocate_node(alloc, node); + + AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + AT_PRINTF("%s", parent->name); + if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); + } + } + AT_PRINTF("\n"); + + // update parents + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + struct hash_node * p_hn = hash_get(ht, parent); + p_hn->n_children -= 1; + + //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); + + if (p_hn->n_children == 0 && p_hn->n_views == 0) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = get_view_source(parent); + struct hash_node * view_src_hn = hash_get(ht, view_src); + view_src_hn->n_views -= 1; + AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views); + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { + ggml_allocator_free_tensor(alloc, view_src); + } + } + else { + if (parent->data != node->data) { + ggml_allocator_free_tensor(alloc, parent); + } + } + } + } + AT_PRINTF("\n"); + } + // free graph outputs here that wouldn't be freed otherwise because they have no children + if (outputs != NULL && outputs[g] != NULL) { + for (int i = 0; outputs[g][i] != NULL; i++) { + struct ggml_tensor * output = outputs[g][i]; + AT_PRINTF("output: %s\n", output->name); + ggml_allocator_free_tensor(alloc, output); + } + } + } + + return alloc->max_size; +} + +size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) { + return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); +} diff --git a/ggml-alloc.h b/ggml-alloc.h new file mode 100644 index 0000000000000..a5ec8f87a9453 --- /dev/null +++ b/ggml-alloc.h @@ -0,0 +1,22 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment); +GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); + +GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); +GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); +GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); +GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor); +GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); + + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index b77f9926754ed..fa0f98aa09df2 100644 --- a/ggml.c +++ b/ggml.c @@ -4557,10 +4557,12 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml static struct ggml_tensor * ggml_new_tensor_impl( struct ggml_context * ctx, - enum ggml_type type, - int n_dims, - const int64_t* ne, - void* data) { + enum ggml_type type, + int n_dims, + const int64_t * ne, + void * data) { + + assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); size_t data_size = 0; @@ -4648,22 +4650,22 @@ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int3 struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, - enum ggml_type type, - int n_dims, - const int64_t * ne) { + enum ggml_type type, + int n_dims, + const int64_t * ne) { return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL); } struct ggml_tensor * ggml_new_tensor_1d( struct ggml_context * ctx, - enum ggml_type type, + enum ggml_type type, int64_t ne0) { return ggml_new_tensor(ctx, type, 1, &ne0); } struct ggml_tensor * ggml_new_tensor_2d( struct ggml_context * ctx, - enum ggml_type type, + enum ggml_type type, int64_t ne0, int64_t ne1) { const int64_t ne[2] = { ne0, ne1 }; @@ -4672,7 +4674,7 @@ struct ggml_tensor * ggml_new_tensor_2d( struct ggml_tensor * ggml_new_tensor_3d( struct ggml_context * ctx, - enum ggml_type type, + enum ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) { @@ -6238,6 +6240,27 @@ struct ggml_tensor * ggml_reshape_4d( // ggml_view_1d +static struct ggml_tensor * ggml_view_tensor_offset( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_dims, + const int64_t * ne, + size_t offset) { + // don't calculate an offset from an unallocated tensor + void * data = NULL; + if (a->data != NULL) { + data = (char *) a->data + offset; + } + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data); + + ggml_format_name(result, "%s (view)", a->name); + + ggml_set_op_params(result, &offset, sizeof(offset)); + + return result; +} + struct ggml_tensor * ggml_view_1d( struct ggml_context * ctx, struct ggml_tensor * a, @@ -6250,10 +6273,7 @@ struct ggml_tensor * ggml_view_1d( is_node = true; } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); - ggml_format_name(result, "%s (view)", a->name); - - ggml_set_op_params(result, &offset, sizeof(offset)); + struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset); result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6280,10 +6300,7 @@ struct ggml_tensor * ggml_view_2d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); - ggml_format_name(result, "%s (view)", a->name); - - ggml_set_op_params(result, &offset, sizeof(offset)); + struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset); result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; @@ -6316,10 +6333,7 @@ struct ggml_tensor * ggml_view_3d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); - ggml_format_name(result, "%s (view)", a->name); - - ggml_set_op_params(result, &offset, sizeof(offset)); + struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset); result->nb[1] = nb1; result->nb[2] = nb2; @@ -6354,10 +6368,7 @@ struct ggml_tensor * ggml_view_4d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); - ggml_format_name(result, "%s (view)", a->name); - - ggml_set_op_params(result, &offset, sizeof(offset)); + struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset); result->nb[1] = nb1; result->nb[2] = nb2; @@ -6741,6 +6752,18 @@ struct ggml_tensor * ggml_rope_inplace( return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true); } +struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false); +} + struct ggml_tensor * ggml_rope_custom_inplace( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/ggml.h b/ggml.h index 9919cce7c263f..aba92480c833c 100644 --- a/ggml.h +++ b/ggml.h @@ -1170,7 +1170,18 @@ extern "C" { int mode, int n_ctx); - // custom RoPE, in-place, returns view(a) + // custom RoPE + GGML_API struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale); + + // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_custom_inplace( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/llama.cpp b/llama.cpp index a35c690ea9f68..6f381f30f8fe1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -56,8 +56,14 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL) +#include "ggml-alloc.h" +#define LLAMA_USE_ALLOCATOR +#else #define LLAMA_USE_SCRATCH #define LLAMA_MAX_SCRATCH_BUFFERS 16 +#endif + // available llama models enum e_model { @@ -327,13 +333,22 @@ struct llama_model { struct llama_context { llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} -#ifdef GGML_USE_METAL ~llama_context() { + if (model_owner) { + delete &model; + } +#ifdef GGML_USE_METAL if (ctx_metal) { ggml_metal_free(ctx_metal); } - } #endif +#ifdef LLAMA_USE_ALLOCATOR + if (alloc) { + ggml_allocr_free(alloc); + } +#endif + } + std::mt19937 rng; bool has_evaluated_once = false; @@ -371,7 +386,17 @@ struct llama_context { // memory buffers used to evaluate the model // TODO: move in llama_state llama_ctx_buffer buf_compute; + +#ifdef LLAMA_USE_ALLOCATOR + llama_ctx_buffer buf_alloc; + ggml_allocr * alloc = NULL; +#endif + +#ifdef LLAMA_USE_SCRATCH llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + int buf_last = 0; + size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; +#endif #ifdef GGML_USE_METAL ggml_metal_context * ctx_metal = NULL; @@ -381,9 +406,6 @@ struct llama_context { ggml_mpi_context * ctx_mpi = NULL; #endif - int buf_last = 0; - size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; - void use_buf(struct ggml_context * ctx, int i) { #if defined(LLAMA_USE_SCRATCH) size_t last_size = 0; @@ -1230,12 +1252,16 @@ static void llama_model_load_internal( const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; // this is the total memory required to run the inference - const size_t mem_required = + size_t mem_required = ctx_size + - mmapped_size - vram_weights + // weights in VRAM not in memory + mmapped_size - vram_weights; // weights in VRAM not in memory + +#ifndef LLAMA_USE_ALLOCATOR + mem_required += MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) + MEM_REQ_SCRATCH1().at(model.type) + MEM_REQ_EVAL().at(model.type); +#endif // this is the memory required by one llama_state const size_t mem_required_state = @@ -1360,32 +1386,15 @@ static bool llama_model_load( } } -// evaluate the transformer -// -// - lctx: llama context -// - tokens: new batch of tokens to process -// - embd embeddings input -// - n_tokens number of tokens -// - n_past: the context size so far -// - n_threads: number of threads to use -// -static bool llama_eval_internal( +static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_token * tokens, const float * embd, int n_tokens, - int n_past, - int n_threads, - const char * cgraph_fname) { + int n_past) { LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); -#ifdef GGML_USE_MPI - ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); -#endif - - const int64_t t_start_us = ggml_time_us(); - const int N = n_tokens; const auto & model = lctx.model; @@ -1401,10 +1410,8 @@ static bool llama_eval_internal( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_vocab = hparams.n_vocab; const int64_t n_embd_gqa = hparams.n_embd_gqa(); - LLAMA_ASSERT(n_embd_head == hparams.n_rot); const float freq_base = hparams.rope_freq_base; @@ -1416,26 +1423,35 @@ static bool llama_eval_internal( auto & mem_per_token = lctx.mem_per_token; auto & buf_compute = lctx.buf_compute; + struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.addr, /*.no_alloc =*/ false, }; +#ifdef LLAMA_USE_ALLOCATOR + params.no_alloc = true; +#endif + struct ggml_context * ctx0 = ggml_init(params); ggml_cgraph * gf = ggml_new_graph(ctx0); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; - struct ggml_tensor * cur; struct ggml_tensor * inpL; if (tokens) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + +#ifdef LLAMA_USE_ALLOCATOR + ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + } +#else memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); +#endif ggml_set_name(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); @@ -1445,7 +1461,15 @@ static bool llama_eval_internal( #endif inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + +#ifdef LLAMA_USE_ALLOCATOR + ggml_allocr_alloc(lctx.alloc, inpL); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + } +#else memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); +#endif } const int i_gpu_start = n_layer - n_gpu_layers; @@ -1472,6 +1496,17 @@ static bool llama_eval_internal( } #endif // GGML_USE_CUBLAS + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); +#ifdef LLAMA_USE_ALLOCATOR + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } +#else + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); +#endif + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + for (int il = 0; il < n_layer; ++il) { ggml_format_name(inpL, "layer_inp_%d", il); @@ -1567,9 +1602,6 @@ static bool llama_eval_internal( ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) - struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - // KQ_scaled shape [n_past + N, N, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); offload_func_kq(KQ_scaled); @@ -1685,9 +1717,6 @@ static bool llama_eval_internal( lctx.use_buf(ctx0, 0); - // used at the end to optionally extract the embeddings - struct ggml_tensor * embeddings = NULL; - // norm { cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); @@ -1698,8 +1727,6 @@ static bool llama_eval_internal( cur = ggml_mul(ctx0, cur, model.norm); // offload_func_nr(cur); // TODO CPU + GPU mirrored backend ggml_set_name(cur, "result_norm"); - - embeddings = cur; } // lm_head @@ -1711,12 +1738,82 @@ static bool llama_eval_internal( // logits -> probs //cur = ggml_soft_max_inplace(ctx0, cur); - // run the computation ggml_build_forward_expand(gf, cur); - // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs); + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + +#if 0 + printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__, + ggml_used_mem(ctx0)/1024.0/1024.0, + lctx.get_buf_max_mem(0)/1024.0/1024.0, + lctx.get_buf_max_mem(1)/1024.0/1024.0, + lctx.work_buffer.size()/1024.0/1024.0, + n_past, N); +#endif + + ggml_free(ctx0); + + return gf; +} + +// evaluate the transformer +// +// - lctx: llama context +// - tokens: new batch of tokens to process +// - embd embeddings input +// - n_tokens number of tokens +// - n_past: the context size so far +// - n_threads: number of threads to use +// +static bool llama_eval_internal( + llama_context & lctx, + const llama_token * tokens, + const float * embd, + int n_tokens, + int n_past, + int n_threads, + const char * cgraph_fname) { + + LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); + + const int64_t t_start_us = ggml_time_us(); + +#ifdef GGML_USE_MPI + ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); +#endif + + const int N = n_tokens; + + const auto & model = lctx.model; + const auto & hparams = model.hparams; + + const auto & kv_self = lctx.kv_self; + + LLAMA_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_vocab = hparams.n_vocab; + +#ifdef LLAMA_USE_ALLOCATOR + ggml_allocr_reset(lctx.alloc); +#endif + + ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past); + +#ifdef LLAMA_USE_ALLOCATOR + ggml_allocr_alloc_graph(lctx.alloc, gf); +#endif + + // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + // for big prompts, if BLAS is enabled, it is better to use only one thread + // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance + n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; #if GGML_USE_MPI + const int64_t n_layer = hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); #endif @@ -1760,6 +1857,10 @@ static bool llama_eval_internal( lctx.kv_self.n = n_past + N; struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + + LLAMA_ASSERT(strcmp(res->name, "result_output") == 0); + LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0); if (cgraph_fname) { ggml_graph_export(gf, cgraph_fname); @@ -1798,21 +1899,6 @@ static bool llama_eval_internal( memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); } - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - -#if 0 - printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__, - ggml_used_mem(ctx0)/1024.0/1024.0, - lctx.get_buf_max_mem(0)/1024.0/1024.0, - lctx.get_buf_max_mem(1)/1024.0/1024.0, - lctx.work_buffer.size()/1024.0/1024.0, - n_past, N); -#endif - - ggml_free(ctx0); - // measure the performance only for the single-token evals if (N == 1) { lctx.t_eval_us += ggml_time_us() - t_start_us; @@ -3180,10 +3266,47 @@ struct llama_context * llama_new_context_with_model( ctx->embedding.resize(hparams.n_embd); } +#ifdef LLAMA_USE_ALLOCATOR + { + static const size_t tensor_alignment = 32; + // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data + ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); + + // create measure allocator + ctx->alloc = ggml_allocr_new_measure(tensor_alignment); + + // build worst-case graph + int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); + int n_past = hparams.n_ctx - n_tokens; + llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); + + // measure memory requirements for the graph + size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; + + fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); + + // debug - for comparison with scratch buffer + //size_t prev_req = + // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) + + // MEM_REQ_SCRATCH1().at(ctx->model.type) + + // MEM_REQ_EVAL().at(ctx->model.type); + //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); + + // recreate allocator with exact memory requirements + ggml_allocr_free(ctx->alloc); + + ctx->buf_alloc.resize(alloc_size); + ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment); + } +#else ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead()); +#endif +#ifdef LLAMA_USE_SCRATCH ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); +#endif } #ifdef GGML_USE_METAL @@ -3253,9 +3376,6 @@ struct llama_context * llama_init_from_file( } void llama_free(struct llama_context * ctx) { - if (ctx->model_owner) { - delete &ctx->model; - } delete ctx; } From 5ea5d19d6a1690348f0a93ee6bfce147e129e461 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 30 Jul 2023 22:31:20 +0800 Subject: [PATCH 23/70] SSE emoji fix --- koboldcpp.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index bbd1664684e0e..d76a8c282e22d 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -400,6 +400,7 @@ async def handle_sse_stream(self): current_token = 0 + incomplete_token_buffer = bytearray() while not handle.has_finished(): if current_token < handle.get_stream_count(): token = handle.new_token(current_token) @@ -409,10 +410,14 @@ async def handle_sse_stream(self): current_token += 1 - tokenStr = ctypes.string_at(token).decode("UTF-8","ignore") - event_data = {"token": tokenStr} - event_str = json.dumps(event_data) - await self.send_sse_event("message", event_str) + newbyte = ctypes.string_at(token) + incomplete_token_buffer += bytearray(newbyte) + tokenStr = incomplete_token_buffer.decode("UTF-8","ignore") + if tokenStr!="": + incomplete_token_buffer.clear() + event_data = {"token": tokenStr} + event_str = json.dumps(event_data) + await self.send_sse_event("message", event_str) await asyncio.sleep(0) From f27972777f5f81a5c75068fbbeff8e637ec21555 Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon, 31 Jul 2023 02:51:35 -0500 Subject: [PATCH 24/70] correct semantic error in import_vars (#355) * Hide unavailable backends & Add tooltip over backend count Hides unavailable backends from the user and if the program is launched without any backends made, it shows an error message to them stating no backends were found and to make them using the 'make' command Add tooltip when hovering over backend count label hovering over the new label that shows the backend count will explain what the numbers are, and show the users which backends are not available or built * add some code comments * hide "missing" if all are built move tooltip functions to helper functions section. hides the string "Missing: ..." from showing if all backends are available " if len(runopts)==6 else + " * small typo fix * remove wrongly added leftover device choosing code * fix labels * move tooltip to function * import vars logic fix --------- Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com> --- koboldcpp.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index d76a8c282e22d..e9d5cd4ce647e 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -1137,10 +1137,7 @@ def import_vars(dict): if str(g) in dict["usecublas"]: gpu_choice_var.set(str(g+1)) break - if "gpulayers" in dict and dict["gpulayers"]: - gpulayers_var.set(dict["gpulayers"]) - - if "noavx2" in dict and "noblas" in dict and dict["noblas"] and dict["noavx2"]: + elif "noavx2" in dict and "noblas" in dict and dict["noblas"] and dict["noavx2"]: if failsafe_option is not None: runopts_var.set(failsafe_option) elif "noavx2" in dict and dict["noavx2"]: @@ -1151,6 +1148,8 @@ def import_vars(dict): runopts_var.set(default_option) elif openblas_option is not None: runopts_var.set(openblas_option) + if "gpulayers" in dict and dict["gpulayers"]: + gpulayers_var.set(dict["gpulayers"]) if "blasthreads" in dict and dict["blasthreads"]: blas_threads_var.set(str(dict["blasthreads"])) else: From 9d2382b3e45b5815fc6a054045a2f2c2b18c22a2 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 31 Jul 2023 11:02:53 +0200 Subject: [PATCH 25/70] Fix Metal backend broken from the allocator changes (#2455) * fix Metal backend broken from the allocator changes --- llama.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/llama.cpp b/llama.cpp index 6f381f30f8fe1..50da4274fa298 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1812,6 +1812,12 @@ static bool llama_eval_internal( // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + + LLAMA_ASSERT(strcmp(res->name, "result_output") == 0); + LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + #if GGML_USE_MPI const int64_t n_layer = hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); @@ -1825,7 +1831,10 @@ static bool llama_eval_internal( //} ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_graph_compute(lctx.ctx_metal, gf); - ggml_metal_get_tensor (lctx.ctx_metal, cur); + ggml_metal_get_tensor (lctx.ctx_metal, res); + if (!lctx.embedding.empty()) { + ggml_metal_get_tensor(lctx.ctx_metal, embeddings); + } } else { // IMPORTANT: // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla @@ -1856,12 +1865,6 @@ static bool llama_eval_internal( // update kv token count lctx.kv_self.n = n_past + N; - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; - - LLAMA_ASSERT(strcmp(res->name, "result_output") == 0); - LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0); - if (cgraph_fname) { ggml_graph_export(gf, cgraph_fname); } From 84ce184c4f1fe9fc973d4bf013fe490b1d441ecf Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 31 Jul 2023 17:33:31 +0800 Subject: [PATCH 26/70] layout --- koboldcpp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index e9d5cd4ce647e..2c4d679bb6df3 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -674,7 +674,7 @@ def show_new_gui(): import customtkinter as ctk nextstate = 0 #0=exit, 1=launch, 2=oldgui - windowwidth = 520 + windowwidth = 530 windowheight = 500 ctk.set_appearance_mode("dark") root = ctk.CTk() @@ -700,7 +700,7 @@ def show_new_gui(): (lib_clblast, "Use CLBlast"), (lib_cublas, "Use CuBLAS"), (lib_default, "Use No BLAS"), - (lib_noavx2, "Use NoAVX2 Mode (Old CPU)"), + (lib_noavx2, "NoAVX2 Mode (Old CPU)"), (lib_failsafe, "Failsafe Mode (Old CPU)")] openblas_option, clblast_option, cublas_option, default_option, openblas_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs) # slider data @@ -1080,7 +1080,7 @@ def export_vars(): args.gpulayers = int(gpulayers_var.get()) if runopts_var.get()=="Use No BLAS": args.noblas = True - if runopts_var.get()=="Use NoAVX2 Mode (Old CPU)": + if runopts_var.get()=="NoAVX2 Mode (Old CPU)": args.noavx2 = True if runopts_var.get()=="Failsafe Mode (Old CPU)": args.noavx2 = True @@ -1297,7 +1297,7 @@ def guilaunch(): blaschoice = tk.StringVar() blaschoice.set("BLAS = 512") - runopts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use CuBLAS GPU","Use No BLAS","Use NoAVX2 Mode (Old CPU)","Failsafe Mode (Old CPU)"] + runopts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use CuBLAS GPU","Use No BLAS","NoAVX2 Mode (Old CPU)","Failsafe Mode (Old CPU)"] runchoice = tk.StringVar() runchoice.set("Use OpenBLAS") From 2dbf518911926ef5a30f43aa83a0b1b1cdeaaa11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 31 Jul 2023 13:18:51 +0200 Subject: [PATCH 27/70] CUDA: fewer memory bank conflicts for mul_mat_q (#2458) --- ggml-cuda.cu | 608 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 484 insertions(+), 124 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0a43fb5dad0f2..3f111565ae444 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -162,7 +162,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_ typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc); typedef void (*load_tiles_cuda_t)( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row); + int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row); typedef float (*vec_dot_q_mul_mat_cuda_t)( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k); @@ -1397,8 +1397,8 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; - __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0)]; + __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; + __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0]; *x_ql = tile_x_qs; *x_dm = tile_x_d; @@ -1406,26 +1406,61 @@ static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** static __device__ __forceinline__ void load_tiles_q4_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < 8); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); const int kbx = k / QI4_0; const int kqsx = k % QI4_0; - const block_q4_0 * bx = ((block_q4_0 *) vx) + i*blocks_per_row + kbx; + const block_q4_0 * bx0 = (block_q4_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + const int i = i0 + i_offset; + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx].x = bxi->d; + } + +// const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; +// const int kbxd = k % blocks_per_tile_x_row; + +// #pragma unroll +// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) { +// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; + +// if (i >= GGML_CUDA_MMQ_Y) { +// return; +// } + +// const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bx->qs, kqsx); - x_dm[i * (WARP_SIZE / QI4_0) + kbx].x = bx->d; +// x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d; +// } } static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + __builtin_assume(i >= 0); + __builtin_assume(i < GGML_CUDA_MMQ_Y); + __builtin_assume(j >= 0); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); return vec_dot_q4_0_q8_1_impl( x_ql[i * (WARP_SIZE + 1) + k], y_qs[j * (2*WARP_SIZE) + kyqs], y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], - x_dm[i * (WARP_SIZE/QI4_0) + k/QI4_0].x, y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + x_dm[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0].x, y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); } #define VDR_q4_1_q8_1 1 @@ -1471,8 +1506,8 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1)]; + __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1]; *x_ql = tile_x_qs; *x_dm = tile_x_dm; @@ -1480,26 +1515,56 @@ static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** static __device__ __forceinline__ void load_tiles_q4_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < 8); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); const int kbx = k / QI4_1; const int kqsx = k % QI4_1; - const block_q4_1 * bx = ((block_q4_1 *) vx) + i*blocks_per_row + kbx; + const block_q4_1 * bx0 = (block_q4_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + const int i = i0 + i_offset; + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) { + const int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bx->qs, kqsx); - x_dm[i * (WARP_SIZE / QI4_1) + kbx] = bx->dm; + x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; + } } static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + __builtin_assume(i >= 0); + __builtin_assume(i < GGML_CUDA_MMQ_Y); + __builtin_assume(j >= 0); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); return vec_dot_q4_1_q8_1_impl( x_ql[i * (WARP_SIZE + 1) + k], y_qs[j * (2*WARP_SIZE) + kyqs], y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], - x_dm[i * (WARP_SIZE/QI4_1) + k/QI4_1], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); } #define VDR_q5_0_q8_1 1 @@ -1543,9 +1608,9 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; - __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0)]; - __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0)]; + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; + __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0]; + __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0]; *x_ql = tile_x_ql; *x_qh = tile_x_qh; @@ -1554,24 +1619,54 @@ static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** static __device__ __forceinline__ void load_tiles_q5_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < 8); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); const int kbx = k / QI5_0; const int kqsx = k % QI5_0; - const block_q5_0 * bx = ((block_q5_0 *) vx) + i*blocks_per_row + kbx; + const block_q5_0 * bx0 = (block_q5_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + const int i = i0 + i_offset; + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; + const int kbxd = k % blocks_per_tile_x_row; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bx->qs, kqsx); - x_qh[i * (WARP_SIZE / QI5_0) + kbx] = get_int_from_uint8(bx->qh, 0); - x_dm[i * (WARP_SIZE / QI5_0) + kbx].x = bx->d; +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) { + const int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_qh[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = get_int_from_uint8(bxi->qh, 0); + x_dm[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd].x = bxi->d; + } } static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + __builtin_assume(i >= 0); + __builtin_assume(i < GGML_CUDA_MMQ_Y); + __builtin_assume(j >= 0); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_0) + k/QI5_0; + const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; return vec_dot_q5_0_q8_1_impl( x_ql[i * (WARP_SIZE + 1) + k], x_qh[index_bx] >> (4 * (k % QI5_0)), y_qs[j * (2*WARP_SIZE) + kyqs], @@ -1629,9 +1724,9 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; - __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1)]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1)]; + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE ) + GGML_CUDA_MMQ_Y]; + __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1]; *x_ql = tile_x_ql; *x_qh = tile_x_qh; @@ -1640,24 +1735,54 @@ static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** static __device__ __forceinline__ void load_tiles_q5_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < 8); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); const int kbx = k / QI5_1; const int kqsx = k % QI5_1; - const block_q5_1 * bx = ((block_q5_1 *) vx) + i*blocks_per_row + kbx; + const block_q5_1 * bx0 = (block_q5_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + const int i = i0 + i_offset; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bx->qs, kqsx); - x_qh[i * (WARP_SIZE / QI5_1) + kbx] = get_int_from_uint8(bx->qh, 0); - x_dm[i * (WARP_SIZE / QI5_1) + kbx] = bx->dm; + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) { + const int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_qh[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = get_int_from_uint8_aligned(bxi->qh, 0); + x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; + } } static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + __builtin_assume(i >= 0); + __builtin_assume(i < GGML_CUDA_MMQ_Y); + __builtin_assume(j >= 0); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_0) + k/QI5_0; + const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; return vec_dot_q5_1_q8_1_impl( x_ql[i * (WARP_SIZE + 1) + k], x_qh[index_bx] >> (4 * (k % QI5_1)), y_qs[j * (2*WARP_SIZE) + kyqs], @@ -1692,8 +1817,8 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; - __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0)]; + __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; + __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0]; *x_ql = tile_x_qs; *x_dm = tile_x_d; @@ -1701,24 +1826,61 @@ static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** static __device__ __forceinline__ void load_tiles_q8_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < 8); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); const int kbx = k / QI8_0; const int kqsx = k % QI8_0; - const block_q8_0 * bx = ((block_q8_0 *) vx) + i*blocks_per_row + kbx; + const block_q8_0 * bx0 = (block_q8_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + const int i = i0 + i_offset; + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); + x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx].x = bxi->d; + } + +// const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; +// const int kbxd = k % blocks_per_tile_x_row; + +// #pragma unroll +// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) { +// const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bx->qs, kqsx); - x_dm[i * (WARP_SIZE / QI8_0) + kbx].x = bx->d; +// #if GGML_CUDA_MMQ_Y < 64 +// if (i >= GGML_CUDA_MMQ_Y) { +// return; +// } +// #endif // GGML_CUDA_MMQ_Y < 64 + +// const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; + +// x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d; +// } } static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + __builtin_assume(i >= 0); + __builtin_assume(i < GGML_CUDA_MMQ_Y); + __builtin_assume(j >= 0); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + return vec_dot_q8_0_q8_1_impl( x_ql[i * (WARP_SIZE + 1) + k], y_qs[j*WARP_SIZE + k], - x_dm[i * (WARP_SIZE/QI8_0) + k/QI8_0].x, y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]); + x_dm[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0].x, y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]); } #define VDR_q2_K_q8_1 1 @@ -1776,9 +1938,9 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE / QI2_K)]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE / 4)]; + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K]; + __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -1787,25 +1949,59 @@ static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** static __device__ __forceinline__ void load_tiles_q2_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < 8); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); const int kbx = k / QI2_K; const int kqsx = k % QI2_K; - const block_q2_K * bx = ((block_q2_K *) vx) + i*blocks_per_row + kbx; + const block_q2_K * bx0 = (block_q2_K *) vx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bx->qs, kqsx); - x_dm[i * (WARP_SIZE / QI2_K) + kbx] = bx->dm; - x_sc[i * (WARP_SIZE / 4) + k/4] = get_int_from_uint8_aligned(bx->scales, kqsx / 4); +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + const int i = i0 + i_offset; + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI2_K; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) { + const int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) { + const int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); + } } static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k < WARP_SIZE); + __builtin_assume(i >= 0); + __builtin_assume(i < GGML_CUDA_MMQ_Y); + __builtin_assume(j >= 0); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); const int kbx = k / QI2_K; const int kqsx = k % QI2_K; @@ -1813,7 +2009,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( const int bq8_offset = QR2_K * (kqsx / QI8_1); const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2); - const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4))) + kbx*16 + scale_offset; + const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16 + scale_offset; int u[QR2_K]; float d8[QR2_K]; @@ -1824,7 +2020,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( d8[l] = y_ds[y_qs_index / QI8_1].x; } - return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + kbx], d8); + return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], d8); } #define VDR_q3_K_q8_1 1 @@ -1892,10 +2088,10 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE / QI2_K)]; - __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE / 2)]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE / 4)]; + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K]; + __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2]; + __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -1905,33 +2101,79 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** static __device__ __forceinline__ void load_tiles_q3_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < 8); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); const int kbx = k / QI3_K; const int kqsx = k % QI3_K; - const block_q3_K * bx = ((block_q3_K *) vx) + i*blocks_per_row + kbx; + const block_q3_K * bx0 = (block_q3_K *) vx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bx->qs, kqsx); - x_dm[i * (WARP_SIZE / QI3_K) + kbx].x = bx->d; - x_qh[i * (WARP_SIZE / 2) + k/2] = get_int_from_uint8(bx->hmask, kqsx / 2); - x_sc[i * (WARP_SIZE / 4) + k/4] = get_int_from_uint8(bx->scales, kqsx / 4); +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + const int i = i0 + i_offset; + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) { + const int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd].x = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) { + const int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); + + x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); + } + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) { + const int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->scales, k % (QI3_K/4)); + } } static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + __builtin_assume(i >= 0); + __builtin_assume(i < GGML_CUDA_MMQ_Y); + __builtin_assume(j >= 0); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + const int kbx = k / QI3_K; const int kqsx = k % QI3_K; const int bq8_offset = QR3_K * (kqsx / (QI3_K/2)); const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2); - const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4))) + kbx*16; + const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16; // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted - const int vh = ~x_qh[i * (WARP_SIZE/2) + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset; + const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset; int u[QR3_K]; float d8[QR3_K]; @@ -1942,7 +2184,8 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( d8[l] = y_ds[y_qs_index / QI8_1].x; } - return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset, x_dm[i * (WARP_SIZE/QI3_K) + kbx].x, d8); + return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset, + x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8); } #define VDR_q4_K_q8_1 2 @@ -2068,9 +2311,9 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K)]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (3*WARP_SIZE/32)]; + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K]; + __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -2079,25 +2322,59 @@ static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** static __device__ __forceinline__ void load_tiles_q4_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < 8); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); - const int kbx = k / QI4_K; - const int kqsx = k % QI4_K; + const int kbx = k / QI4_K; // == 0 if QK_K == 256 + const int kqsx = k % QI4_K; // == k if QK_K == 256 - const block_q4_K * bx = ((block_q4_K *) vx) + i*blocks_per_row + kbx; + const block_q4_K * bx0 = (block_q4_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + const int i = i0 + i_offset; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bx->qs, kqsx); - x_dm[i * (WARP_SIZE / QI6_K) + kbx] = bx->dm; - x_sc[i * (3*WARP_SIZE/32) + k % (3*WARP_SIZE/32)] = get_int_from_uint8_aligned(bx->scales, k % (3*WARP_SIZE/32)); + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) { + const int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { + const int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; + + const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); + + x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8)); + } } static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k < WARP_SIZE); + __builtin_assume(i >= 0); + __builtin_assume(i < GGML_CUDA_MMQ_Y); + __builtin_assume(j >= 0); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); const int kbx = k / QI6_K; // == 0 if QK_K == 256 const int kqsx = k % QI6_K; // == k if QK_K == 256 @@ -2112,7 +2389,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 0]; v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 4]; - const uint16_t * scales = (const uint16_t *) &x_sc[i * (3*WARP_SIZE/32) + kbx * (3*WARP_SIZE/32)]; + const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4]; uint16_t aux[2]; const int l = bq8_offset/2; if (l < 2) { @@ -2132,7 +2409,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( d8[l] = y_ds[kqsy / QI8_1].x; } - return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + kbx], d8); + return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8); } #define VDR_q5_K_q8_1 2 @@ -2260,10 +2537,10 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K)]; - __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4)]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (3*WARP_SIZE/32)]; + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K]; + __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4]; + __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -2273,26 +2550,68 @@ static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** static __device__ __forceinline__ void load_tiles_q5_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < 8); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI5_K; // == 0 if QK_K == 256 + const int kqsx = k % QI5_K; // == k if QK_K == 256 + + const block_q5_K * bx0 = (block_q5_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + const int i = i0 + i_offset; + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; - const int kbx = k / QI5_K; - const int kqsx = k % QI5_K; + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) { + const int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) { + const int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4); - const block_q5_K * bx = ((block_q5_K *) vx) + i*blocks_per_row + kbx; + x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4)); + } + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { + const int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bx->qs, kqsx); - x_dm[i * (WARP_SIZE / QI6_K) + kbx] = bx->dm; - x_qh[i * (WARP_SIZE / 4) + k/4] = get_int_from_uint8_aligned(bx->qh, kqsx/4); - x_sc[i * (3*WARP_SIZE/32) + k % (3*WARP_SIZE/32)] = get_int_from_uint8_aligned(bx->scales, k % (3*WARP_SIZE/32)); + const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); + + x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8)); + } } static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i < 2*WARP_SIZE); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k < WARP_SIZE); + __builtin_assume(i >= 0); + __builtin_assume(i < GGML_CUDA_MMQ_Y); + __builtin_assume(j >= 0); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); const int kbx = k / QI6_K; // == 0 if QK_K == 256 const int kqsx = k % QI6_K; // == k if QK_K == 256 @@ -2307,10 +2626,10 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 0]; vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 4]; - vh[0] = x_qh[i * (WARP_SIZE/4) + kqsx % 4 + 0] >> bq8_offset; - vh[1] = x_qh[i * (WARP_SIZE/4) + kqsx % 4 + 4] >> bq8_offset; + vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + kqsx % 4 + 0] >> bq8_offset; + vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + kqsx % 4 + 4] >> bq8_offset; - const uint16_t * scales = (const uint16_t *) &x_sc[i * (3*WARP_SIZE/32) + kbx * (3*WARP_SIZE/32)]; + const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4]; uint16_t aux[2]; const int l = bq8_offset/2; if (l < 2) { @@ -2330,7 +2649,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( d8[l] = y_ds[kqsy / QI8_1].x; } - return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + kbx], d8); + return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8); } #define VDR_q6_K_q8_1 1 @@ -2387,10 +2706,10 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K)]; - __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2)]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8)]; + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; + __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K]; + __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2]; + __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -2400,26 +2719,68 @@ static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** static __device__ __forceinline__ void load_tiles_q6_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) { + int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) { - const int kbx = k / QI6_K; - const int kqsx = k % QI6_K; + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < 8); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); - const block_q6_K * bx = ((block_q6_K *) vx) + i*blocks_per_row + kbx; + const int kbx = k / QI6_K; // == 0 if QK_K == 256 + const int kqsx = k % QI6_K; // == k if QK_K == 256 + + const block_q6_K * bx0 = (block_q6_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + const int i = i0 + i_offset; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bx->ql, kqsx); - x_dm[i * (WARP_SIZE / QI6_K) + kbx].x = bx->d; - x_qh[i * (WARP_SIZE / 2) + k/2] = get_int_from_uint8(bx->qh, kqsx/2); - x_sc[i * (WARP_SIZE / 8) + k/8] = get_int_from_int8(bx->scales, kqsx/8); + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->ql, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) { + const int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd].x = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) { + const int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); + + const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2); + + x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->qh, k % (QI6_K/2)); + } + +#pragma unroll + for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { + const int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; + + const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; + + x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); + } } static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k < WARP_SIZE); + __builtin_assume(i >= 0); + __builtin_assume(i < GGML_CUDA_MMQ_Y); + __builtin_assume(j >= 0); + __builtin_assume(j < WARP_SIZE); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); const int kbx = k / QI6_K; // == 0 if QK_K == 256 const int kqsx = k % QI6_K; // == k if QK_K == 256 @@ -2428,9 +2789,9 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8); const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)); - const int vh = x_qh[i * (WARP_SIZE/2) + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift; + const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift; - const int x_sc_offset = i * (WARP_SIZE/8) + kbx * (QI6_K/8); + const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8); const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset; int u[QR6_K]; @@ -2442,7 +2803,8 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( d8[l] = y_ds[kqsy / QI8_1].x; } - return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, x_dm[i * (WARP_SIZE/QI6_K) + kbx].x, d8); + return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, + x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8); } template qs, tid_x % QI8_1); } From 1215ed7d5ccf854a55eccb52661427bb985e7f2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 31 Jul 2023 14:32:30 +0200 Subject: [PATCH 28/70] CUDA: Implemented row flattening for non-glm RoPE (#2468) --- ggml-cuda.cu | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 3f111565ae444..bcdff3640c13d 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3150,7 +3150,8 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, } // rope == RoPE == rotary positional embedding -static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) { +static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0, + const float p_delta, const int p_delta_rows, const float theta_scale) { const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x); if (col >= ncols) { @@ -3160,7 +3161,7 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c const int row = blockDim.y*blockIdx.y + threadIdx.y; const int i = row*ncols + col; - const float theta = p*powf(theta_scale, col/2); + const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2); const float sin_theta = sinf(theta); const float cos_theta = cosf(theta); @@ -3764,12 +3765,13 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons scale_f32<<>>(x, dst, scale, k); } -static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) { +static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, + const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { GGML_ASSERT(nrows % 2 == 0); const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(num_blocks_x, nrows, 1); - rope_f32<<>>(x, dst, ncols, p, theta_scale); + rope_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); } static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) { @@ -4465,6 +4467,7 @@ inline void ggml_cuda_op_rope( GGML_ASSERT(dst_ddf_i != nullptr); const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; const int64_t i01_diff = i01_high - i01_low; const int n_past = ((int32_t *) dst->op_params)[0]; @@ -4478,17 +4481,18 @@ inline void ggml_cuda_op_rope( memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); - const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale; - bool is_glm = mode & 4; + const bool is_glm = mode & 4; // compute if (is_glm) { + const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale; const float id_p = min(p, n_ctx - 2.f); const float block_p = max(p - (n_ctx - 2.f), 0.f); rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main); } else { - rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main); + const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale; + rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main); } (void) src1; @@ -5103,7 +5107,10 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results + + const int mode = ((int32_t *) dst->op_params)[2]; + const bool is_glm = mode & 4; + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm } void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { From 0728c5a8b9569183ffca0399caac099afef87595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 31 Jul 2023 15:44:35 +0200 Subject: [PATCH 29/70] CUDA: mmq CLI option, fixed mmq build issues (#2453) --- CMakeLists.txt | 16 ++++++++++------ Makefile | 6 +++--- README.md | 4 +++- examples/common.cpp | 16 +++++++++++++--- examples/common.h | 1 + examples/server/server.cpp | 15 +++++++++++++-- ggml-cuda.cu | 24 ++++++++++++++---------- ggml-cuda.h | 1 + llama.cpp | 10 ++++++++-- llama.h | 1 + 10 files changed, 67 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 57678a302d097..4ecb3d5862a0d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,7 +68,7 @@ option(LLAMA_ACCELERATE "llama: enable Accelerate framework option(LLAMA_BLAS "llama: use BLAS" OFF) set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") option(LLAMA_CUBLAS "llama: use CUDA" OFF) -option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) +#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels") option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") @@ -253,9 +253,9 @@ if (LLAMA_CUBLAS) set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h) add_compile_definitions(GGML_USE_CUBLAS) - if (LLAMA_CUDA_CUBLAS) - add_compile_definitions(GGML_CUDA_CUBLAS) - endif() +# if (LLAMA_CUDA_CUBLAS) +# add_compile_definitions(GGML_CUDA_CUBLAS) +# endif() add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y}) if (LLAMA_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV) @@ -277,10 +277,14 @@ if (LLAMA_CUBLAS) endif() if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + # 52 == lowest CUDA 12 standard + # 60 == f16 CUDA intrinsics + # 61 == integer CUDA intrinsics + # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster if (LLAMA_CUDA_DMMV_F16) - set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics + set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics else() - set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics + set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics endif() endif() message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") diff --git a/Makefile b/Makefile index 616c2d9b88be9..ebeadfdd05fd5 100644 --- a/Makefile +++ b/Makefile @@ -236,9 +236,9 @@ ifdef LLAMA_CUDA_MMQ_Y else NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64 endif # LLAMA_CUDA_MMQ_Y -ifdef LLAMA_CUDA_CUBLAS - NVCCFLAGS += -DGGML_CUDA_CUBLAS -endif # LLAMA_CUDA_CUBLAS +#ifdef LLAMA_CUDA_CUBLAS +# NVCCFLAGS += -DGGML_CUDA_CUBLAS +#endif # LLAMA_CUDA_CUBLAS ifdef LLAMA_CUDA_CCBIN NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) endif diff --git a/README.md b/README.md index 42fc42b05ae56..b231d24b81859 100644 --- a/README.md +++ b/README.md @@ -400,9 +400,11 @@ Building the program with BLAS support may lead to some performance improvements The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance: + | Option | Legal values | Default | Description | |-------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). | | LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. | | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | diff --git a/examples/common.cpp b/examples/common.cpp index fe7308b1787eb..e6439841dc145 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -352,7 +352,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { #ifdef GGML_USE_CUBLAS params.main_gpu = std::stoi(argv[i]); #else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); #endif } else if (arg == "--tensor-split" || arg == "-ts") { if (++i >= argc) { @@ -376,13 +376,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } } #else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); +#endif // GGML_USE_CUBLAS + } else if (arg == "--mul-mat-q" || arg == "-mmq") { +#ifdef GGML_USE_CUBLAS + params.mul_mat_q = true; +#else + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n"); #endif // GGML_USE_CUBLAS } else if (arg == "--low-vram" || arg == "-lv") { #ifdef GGML_USE_CUBLAS params.low_vram = true; #else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n"); + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n"); #endif // GGML_USE_CUBLAS } else if (arg == "--no-mmap") { params.use_mmap = false; @@ -585,6 +591,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" ); + fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" ); + fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" ); + fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" ); #endif fprintf(stdout, " --mtest compute maximum memory usage\n"); fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n"); @@ -637,6 +646,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param lparams.main_gpu = params.main_gpu; lparams.tensor_split = params.tensor_split; lparams.low_vram = params.low_vram; + lparams.mul_mat_q = params.mul_mat_q; lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.use_mmap = params.use_mmap; diff --git a/examples/common.h b/examples/common.h index 1184f32df5074..974484207251d 100644 --- a/examples/common.h +++ b/examples/common.h @@ -74,6 +74,7 @@ struct gpt_params { size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score bool low_vram = false; // if true, reduce VRAM usage at the cost of performance + bool mul_mat_q = false; // if true, use experimental mul_mat_q kernels bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 83c03065a5d58..c0725088f1018 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -631,6 +631,9 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); + fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" ); + fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" ); + fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" ); #endif fprintf(stdout, " -m FNAME, --model FNAME\n"); fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); @@ -827,7 +830,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } } #else - LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {}); + LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {}); #endif // GGML_USE_CUBLAS } else if (arg == "--low-vram" || arg == "-lv") @@ -835,7 +838,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, #ifdef GGML_USE_CUBLAS params.low_vram = true; #else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n"); + LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {}); +#endif // GGML_USE_CUBLAS + } + else if (arg == "--mul-mat-q" || arg == "-mmq") + { +#ifdef GGML_USE_CUBLAS + params.mul_mat_q = true; +#else + LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {}); #endif // GGML_USE_CUBLAS } else if (arg == "--main-gpu" || arg == "-mg") diff --git a/ggml-cuda.cu b/ggml-cuda.cu index bcdff3640c13d..f11fbe57c1112 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3898,10 +3898,9 @@ static size_t g_scratch_offset = 0; static int g_device_count = -1; static int g_main_device = 0; -#ifndef GGML_CUDA_FORCE_DMMV static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; -#endif static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; +static bool g_mul_mat_q = false; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; @@ -3923,9 +3922,7 @@ void ggml_init_cublas() { g_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; -#ifndef GGML_CUDA_FORCE_DMMV g_compute_capabilities[id] = 100*prop.major + 10*prop.minor; -#endif } for (int id = 0; id < g_device_count; ++id) { g_tensor_split[id] /= total_vram; @@ -4278,6 +4275,7 @@ inline void ggml_cuda_op_mul_mat_vec( #ifdef GGML_CUDA_FORCE_DMMV const bool use_mul_mat_vec_q = false; + (void) g_compute_capabilities[0]; #else int id; CUDA_CHECK(cudaGetDevice(&id)); @@ -5021,12 +5019,14 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false); } else { -#ifdef GGML_CUDA_CUBLAS - const bool use_mul_mat_q = false; -#else - const bool use_mul_mat_q = ggml_is_quantized(src0->type); -#endif // GGML_CUDA_CUBLAS - if (use_mul_mat_q) { + int min_compute_capability = INT_MAX; + for (int id = 0; id < g_device_count; ++id) { + if (min_compute_capability > g_compute_capabilities[id]) { + min_compute_capability = g_compute_capabilities[id]; + } + } + + if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) { ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false); } else { ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false); @@ -5320,6 +5320,10 @@ void ggml_cuda_set_main_device(int main_device) { } } +void ggml_cuda_set_mul_mat_q(bool mul_mat_q) { + g_mul_mat_q = mul_mat_q; +} + void ggml_cuda_set_scratch_size(size_t scratch_size) { g_scratch_size = scratch_size; } diff --git a/ggml-cuda.h b/ggml-cuda.h index 3c1e8deb6a6dd..72d7afa463d74 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor); void ggml_cuda_set_main_device(int main_device); +void ggml_cuda_set_mul_mat_q(bool mul_mat_q); void ggml_cuda_set_scratch_size(size_t scratch_size); void ggml_cuda_free_scratch(void); bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); diff --git a/llama.cpp b/llama.cpp index 50da4274fa298..d427054dd8d6a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -901,6 +901,7 @@ struct llama_context_params llama_context_default_params() { /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, + /*.mul_mat_q =*/ false, /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.vocab_only =*/ false, @@ -1028,6 +1029,7 @@ static void llama_model_load_internal( int n_gpu_layers, int main_gpu, const float * tensor_split, + const bool mul_mat_q, float rope_freq_base, float rope_freq_scale, bool low_vram, @@ -1156,9 +1158,11 @@ static void llama_model_load_internal( } (void) main_gpu; + (void) mul_mat_q; #if defined(GGML_USE_CUBLAS) fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); ggml_cuda_set_main_device(main_gpu); + ggml_cuda_set_mul_mat_q(mul_mat_q); #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT #elif defined(GGML_USE_CLBLAST) @@ -1367,6 +1371,7 @@ static bool llama_model_load( int n_gpu_layers, int main_gpu, const float * tensor_split, + const bool mul_mat_q, float rope_freq_base, float rope_freq_scale, bool low_vram, @@ -1377,7 +1382,8 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type, + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, + main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { @@ -3192,7 +3198,7 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers, - params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram, + params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { delete model; diff --git a/llama.h b/llama.h index df46f9b9cfa31..fa1977f2d9492 100644 --- a/llama.h +++ b/llama.h @@ -108,6 +108,7 @@ extern "C" { // Keep the booleans together to avoid misalignment during copy-by-value. bool low_vram; // if true, reduce VRAM usage at the cost of performance + bool mul_mat_q; // if true, use experimental mul_mat_q kernels bool f16_kv; // use fp16 for KV cache bool logits_all; // the llama_eval() call computes all logits, not just the last one bool vocab_only; // only load the vocabulary, no weights From b772bba42e3bbca3cdab224456f8ff2ce427fd0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 31 Jul 2023 19:52:22 +0200 Subject: [PATCH 30/70] CUDA: fixed cmake F16 option (#2471) --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ecb3d5862a0d..1d4e63f3e72c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,7 +73,7 @@ set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA ke option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") -option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF) +option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF) set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" OFF) @@ -265,8 +265,8 @@ if (LLAMA_CUBLAS) if (DEFINED LLAMA_CUDA_DMMV_Y) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility endif() - if (LLAMA_CUDA_DMMV_F16) - add_compile_definitions(GGML_CUDA_DMMV_F16) + if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16) + add_compile_definitions(GGML_CUDA_F16) endif() add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) From 49e7cb5bb1f75c91dd5db7d2d88cbc11bd9ee0c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 31 Jul 2023 21:02:19 +0200 Subject: [PATCH 31/70] CUDA: fixed LLAMA_FAST compilation option (#2473) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ebeadfdd05fd5..100614b4bb871 100644 --- a/Makefile +++ b/Makefile @@ -243,7 +243,7 @@ ifdef LLAMA_CUDA_CCBIN NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) endif ggml-cuda.o: ggml-cuda.cu ggml-cuda.h - $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@ + $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@ endif # LLAMA_CUBLAS ifdef LLAMA_CLBLAST From 1873ff586bd8499a18f763632711bf15d253585e Mon Sep 17 00:00:00 2001 From: Matteo Boschini <12133566+mbosc@users.noreply.github.com> Date: Tue, 1 Aug 2023 09:43:12 +0200 Subject: [PATCH 32/70] metal : add gqa8 kernel to allow llama-2-70B on metal (#2459) * Added gqa8 kernel to allow llama-2-70B on metal * Update ggml-metal.m Co-authored-by: Cebtenzzre * Extend kernel_mul_mat_f16_f32 to handle gqa broadcast * Added ne03==ne13 assertion --------- Co-authored-by: Cebtenzzre --- ggml-metal.m | 33 +++++++++++++++++---------------- ggml-metal.metal | 5 ++++- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 74a6bff404117..3f098d39677a0 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -718,7 +718,8 @@ void ggml_metal_graph_compute( // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224 GGML_ASSERT(ne00 == ne10); - GGML_ASSERT(ne02 == ne12); + // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere + GGML_ASSERT(ne03 == ne13); if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && @@ -746,11 +747,11 @@ void ggml_metal_graph_compute( initWithDevice:ctx->device transposeLeft:false transposeRight:true resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0]; - // we need to do ne02 multiplications + // we need to do ne12 multiplications // TODO: is there a way to do this in parallel - currently very slow .. // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS - for (int64_t i02 = 0; i02 < ne02; ++i02) { - size_t offs_src0_cur = offs_src0 + i02*nb02; + for (int64_t i02 = 0; i02 < ne12; ++i02) { + size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now size_t offs_src1_cur = offs_src1 + i02*nb12; size_t offs_dst_cur = offs_dst + i02*nb2; @@ -772,8 +773,6 @@ void ggml_metal_graph_compute( switch (src0t) { case GGML_TYPE_F16: { - GGML_ASSERT(ne02 == ne12); - nth0 = 64; nth1 = 1; [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; @@ -853,16 +852,18 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) { diff --git a/ggml-metal.metal b/ggml-metal.metal index 696b33ce75cf4..8d26b5ec2dfa4 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32( device float * dst, constant int64_t & ne00, constant int64_t & ne01, + constant int64_t & ne02, constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, constant int64_t & ne10, constant int64_t & ne11, + constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, @@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32( const int64_t r1 = tgpig.y; const int64_t im = tgpig.z; - device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02); + device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); sum[tpitg.x] = 0.0f; @@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32( } } + kernel void kernel_alibi_f32( device const float * src0, device float * dst, From 86aeb27734481751592abd85590020b40d9224c8 Mon Sep 17 00:00:00 2001 From: ebraminio Date: Tue, 1 Aug 2023 01:56:23 -0700 Subject: [PATCH 33/70] server : Support dark mode (#2414) * server : Support dark mode So it respects user system light / dark settings. * Update index.html.hpp by running ./deps.sh --- examples/server/index.html.hpp | 2126 ++++++++++++++--------------- examples/server/public/index.html | 3 +- 2 files changed, 1064 insertions(+), 1065 deletions(-) diff --git a/examples/server/index.html.hpp b/examples/server/index.html.hpp index 1b21d4d55550b..5bca2a8ab03ac 100644 --- a/examples/server/index.html.hpp +++ b/examples/server/index.html.hpp @@ -10,1144 +10,1144 @@ unsigned char index_html[] = { 0x74, 0x69, 0x61, 0x6c, 0x2d, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x3d, 0x31, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x2d, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x3d, 0x31, 0x22, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, - 0x3c, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, - 0x2e, 0x63, 0x70, 0x70, 0x20, 0x2d, 0x20, 0x63, 0x68, 0x61, 0x74, 0x3c, - 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x3c, - 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x62, - 0x6f, 0x64, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x63, - 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x66, 0x66, 0x66, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, - 0x20, 0x23, 0x30, 0x30, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c, 0x79, - 0x3a, 0x20, 0x73, 0x79, 0x73, 0x74, 0x65, 0x6d, 0x2d, 0x75, 0x69, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, - 0x73, 0x69, 0x7a, 0x65, 0x3a, 0x20, 0x39, 0x30, 0x25, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x23, 0x63, - 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, - 0x20, 0x30, 0x65, 0x6d, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, - 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, 0x65, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, 0x6f, 0x6c, 0x75, 0x6d, 0x6e, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6a, 0x75, 0x73, 0x74, - 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3a, - 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x2d, 0x62, 0x65, 0x74, 0x77, 0x65, - 0x65, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, - 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, - 0x61, 0x69, 0x6e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x33, 0x70, 0x78, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, - 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, - 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, 0x6f, 0x6c, 0x75, - 0x6d, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6a, 0x75, - 0x73, 0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, - 0x74, 0x3a, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x2d, 0x62, 0x65, 0x74, - 0x77, 0x65, 0x65, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x67, 0x61, 0x70, 0x3a, 0x20, 0x31, 0x65, 0x6d, 0x3b, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x67, 0x72, - 0x6f, 0x77, 0x3a, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x6f, 0x76, 0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77, 0x2d, 0x79, 0x3a, - 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x3a, 0x20, 0x31, 0x70, - 0x78, 0x20, 0x73, 0x6f, 0x6c, 0x69, 0x64, 0x20, 0x23, 0x63, 0x63, 0x63, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, - 0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, 0x73, 0x3a, 0x20, 0x35, - 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, - 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x3c, 0x6d, 0x65, 0x74, 0x61, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, + 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x65, + 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3d, 0x22, 0x6c, + 0x69, 0x67, 0x68, 0x74, 0x20, 0x64, 0x61, 0x72, 0x6b, 0x22, 0x3e, 0x0a, + 0x20, 0x20, 0x3c, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x6c, 0x6c, 0x61, + 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x2d, 0x20, 0x63, 0x68, 0x61, + 0x74, 0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x0a, 0x0a, 0x20, + 0x20, 0x3c, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x64, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6d, 0x61, 0x78, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3a, - 0x20, 0x36, 0x30, 0x30, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6d, 0x69, 0x6e, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3a, - 0x20, 0x33, 0x30, 0x30, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x2d, 0x68, 0x65, 0x69, 0x67, 0x68, - 0x74, 0x3a, 0x20, 0x31, 0x2e, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x20, - 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x20, 0x30, - 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77, 0x2d, - 0x77, 0x72, 0x61, 0x70, 0x3a, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2d, - 0x77, 0x6f, 0x72, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x77, 0x6f, 0x72, 0x64, 0x2d, 0x77, 0x72, 0x61, 0x70, 0x3a, 0x20, 0x62, - 0x72, 0x65, 0x61, 0x6b, 0x2d, 0x77, 0x6f, 0x72, 0x64, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x79, 0x70, 0x68, 0x65, 0x6e, 0x73, - 0x3a, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x2d, 0x74, 0x6f, 0x70, - 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x2d, 0x62, 0x6f, - 0x74, 0x74, 0x6f, 0x6d, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x23, 0x77, 0x72, 0x69, 0x74, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, - 0x69, 0x6e, 0x3a, 0x20, 0x31, 0x65, 0x6d, 0x20, 0x30, 0x20, 0x30, 0x20, - 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, - 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, - 0x69, 0x72, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, 0x6f, - 0x6c, 0x75, 0x6d, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x67, 0x61, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x2d, - 0x69, 0x74, 0x65, 0x6d, 0x73, 0x3a, 0x20, 0x73, 0x74, 0x72, 0x65, 0x74, - 0x63, 0x68, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x2e, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c, + 0x79, 0x3a, 0x20, 0x73, 0x79, 0x73, 0x74, 0x65, 0x6d, 0x2d, 0x75, 0x69, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, + 0x2d, 0x73, 0x69, 0x7a, 0x65, 0x3a, 0x20, 0x39, 0x30, 0x25, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x23, + 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, + 0x3a, 0x20, 0x30, 0x65, 0x6d, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, 0x65, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x72, 0x6f, 0x77, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x30, - 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x6a, 0x75, 0x73, 0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, - 0x65, 0x6e, 0x74, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x65, 0x6e, - 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, - 0x72, 0x3a, 0x20, 0x6e, 0x6f, 0x6e, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, - 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, - 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, - 0x73, 0x65, 0x74, 0x2e, 0x74, 0x77, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, - 0x20, 0x67, 0x72, 0x69, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x67, 0x72, 0x69, 0x64, 0x2d, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x3a, 0x20, 0x22, 0x61, 0x20, 0x61, 0x22, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x31, 0x65, - 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x2e, 0x74, - 0x68, 0x72, 0x65, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x67, 0x72, - 0x69, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x72, - 0x69, 0x64, 0x2d, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3a, - 0x20, 0x22, 0x61, 0x20, 0x61, 0x20, 0x61, 0x22, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x31, 0x65, 0x6d, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x3a, - 0x20, 0x31, 0x70, 0x78, 0x20, 0x73, 0x6f, 0x6c, 0x69, 0x64, 0x20, 0x23, - 0x61, 0x61, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, - 0x6f, 0x72, 0x64, 0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, 0x73, - 0x3a, 0x20, 0x34, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x2e, - 0x35, 0x65, 0x6d, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x30, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, - 0x6e, 0x2d, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x77, 0x65, - 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x62, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, - 0x3a, 0x20, 0x2d, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x2d, 0x30, 0x2e, - 0x35, 0x65, 0x6d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x2e, - 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x75, 0x72, 0x73, 0x6f, 0x72, 0x3a, 0x20, 0x70, 0x6f, 0x69, 0x6e, 0x74, - 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x5b, 0x6f, - 0x70, 0x65, 0x6e, 0x5d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x2e, - 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, - 0x61, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, - 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x35, 0x70, 0x78, 0x3b, 0x0a, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, 0x6f, 0x6c, 0x75, 0x6d, + 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6a, 0x75, 0x73, + 0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x3a, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x2d, 0x62, 0x65, 0x74, 0x77, + 0x65, 0x65, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, + 0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x33, 0x70, 0x78, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, + 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, + 0x72, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, 0x6f, 0x6c, + 0x75, 0x6d, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6a, + 0x75, 0x73, 0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, + 0x6e, 0x74, 0x3a, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x2d, 0x62, 0x65, + 0x74, 0x77, 0x65, 0x65, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x31, 0x65, 0x6d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x67, 0x72, 0x6f, 0x77, 0x3a, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3a, 0x20, 0x31, 0x30, 0x30, - 0x25, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x70, 0x72, 0x65, 0x20, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, - 0x61, 0x79, 0x3a, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, - 0x75, 0x6e, 0x64, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, - 0x32, 0x32, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x64, 0x64, 0x64, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x64, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, - 0x6f, 0x6e, 0x74, 0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c, 0x79, 0x3a, 0x20, - 0x6d, 0x6f, 0x6e, 0x6f, 0x73, 0x70, 0x61, 0x63, 0x65, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, - 0x3a, 0x20, 0x30, 0x2e, 0x31, 0x65, 0x6d, 0x20, 0x30, 0x2e, 0x33, 0x65, - 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, + 0x20, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77, 0x2d, 0x79, + 0x3a, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x3a, 0x20, 0x31, + 0x70, 0x78, 0x20, 0x73, 0x6f, 0x6c, 0x69, 0x64, 0x20, 0x23, 0x63, 0x63, + 0x63, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, 0x73, 0x3a, 0x20, - 0x33, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, - 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x35, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, + 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, + 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x62, 0x6f, 0x64, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x6d, 0x61, 0x78, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, + 0x3a, 0x20, 0x36, 0x30, 0x30, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x6d, 0x69, 0x6e, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, + 0x3a, 0x20, 0x33, 0x30, 0x30, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x2d, 0x68, 0x65, 0x69, 0x67, + 0x68, 0x74, 0x3a, 0x20, 0x31, 0x2e, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, + 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x20, + 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77, + 0x2d, 0x77, 0x72, 0x61, 0x70, 0x3a, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, + 0x2d, 0x77, 0x6f, 0x72, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x77, 0x6f, 0x72, 0x64, 0x2d, 0x77, 0x72, 0x61, 0x70, 0x3a, 0x20, + 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2d, 0x77, 0x6f, 0x72, 0x64, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x79, 0x70, 0x68, 0x65, 0x6e, + 0x73, 0x3a, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x2d, 0x74, 0x6f, + 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x2d, 0x62, + 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x23, 0x77, 0x72, 0x69, 0x74, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, + 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x31, 0x65, 0x6d, 0x20, 0x30, 0x20, 0x30, + 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, + 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, + 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, + 0x6f, 0x6c, 0x75, 0x6d, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, + 0x2d, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x3a, 0x20, 0x73, 0x74, 0x72, 0x65, + 0x74, 0x63, 0x68, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, + 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, + 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x72, 0x6f, 0x77, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, + 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x6a, 0x75, 0x73, 0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, + 0x74, 0x65, 0x6e, 0x74, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x65, + 0x6e, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, + 0x65, 0x72, 0x3a, 0x20, 0x6e, 0x6f, 0x6e, 0x65, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, + 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, + 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, + 0x64, 0x73, 0x65, 0x74, 0x2e, 0x74, 0x77, 0x6f, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, + 0x3a, 0x20, 0x67, 0x72, 0x69, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x67, 0x72, 0x69, 0x64, 0x2d, 0x74, 0x65, 0x6d, 0x70, 0x6c, + 0x61, 0x74, 0x65, 0x3a, 0x20, 0x22, 0x61, 0x20, 0x61, 0x22, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x31, + 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x2e, + 0x74, 0x68, 0x72, 0x65, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x67, + 0x72, 0x69, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, + 0x72, 0x69, 0x64, 0x2d, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x3a, 0x20, 0x22, 0x61, 0x20, 0x61, 0x20, 0x61, 0x22, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x31, 0x65, + 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, + 0x3a, 0x20, 0x31, 0x70, 0x78, 0x20, 0x73, 0x6f, 0x6c, 0x69, 0x64, 0x20, + 0x23, 0x61, 0x61, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, + 0x73, 0x3a, 0x20, 0x34, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, + 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x30, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, + 0x69, 0x6e, 0x2d, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, + 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x77, + 0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x62, 0x6f, 0x6c, 0x64, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, + 0x6e, 0x3a, 0x20, 0x2d, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x2d, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x62, - 0x6c, 0x6f, 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2c, - 0x20, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2d, 0x61, 0x6c, 0x69, - 0x67, 0x6e, 0x3a, 0x20, 0x63, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, - 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x73, 0x69, 0x7a, 0x65, 0x3a, - 0x20, 0x38, 0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x38, 0x38, 0x38, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73, - 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x3c, 0x73, 0x63, - 0x72, 0x69, 0x70, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x6d, - 0x6f, 0x64, 0x75, 0x6c, 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x2c, 0x20, 0x68, 0x2c, 0x20, - 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x20, 0x65, 0x66, 0x66, 0x65, - 0x63, 0x74, 0x2c, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, - 0x2c, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x2c, 0x20, 0x75, 0x73, - 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x20, 0x75, 0x73, 0x65, - 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x20, 0x75, 0x73, 0x65, 0x52, - 0x65, 0x66, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, - 0x6d, 0x20, 0x27, 0x2f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x2e, 0x6a, 0x73, - 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, - 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x20, 0x7d, - 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, - 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x3b, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, - 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, + 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, + 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x75, 0x72, 0x73, 0x6f, 0x72, 0x3a, 0x20, 0x70, 0x6f, 0x69, 0x6e, + 0x74, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x5b, + 0x6f, 0x70, 0x65, 0x6e, 0x5d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, + 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, + 0x65, 0x61, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, + 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x35, 0x70, 0x78, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, + 0x67, 0x72, 0x6f, 0x77, 0x3a, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3a, 0x20, 0x31, 0x30, + 0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x70, 0x72, 0x65, 0x20, 0x63, 0x6f, 0x64, 0x65, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, + 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, + 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, + 0x23, 0x32, 0x32, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x64, 0x64, 0x64, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x64, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c, 0x79, 0x3a, + 0x20, 0x6d, 0x6f, 0x6e, 0x6f, 0x73, 0x70, 0x61, 0x63, 0x65, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, + 0x67, 0x3a, 0x20, 0x30, 0x2e, 0x31, 0x65, 0x6d, 0x20, 0x30, 0x2e, 0x33, + 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, + 0x72, 0x64, 0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, 0x73, 0x3a, + 0x20, 0x33, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, + 0x74, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, + 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, + 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, + 0x2c, 0x20, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2d, 0x61, 0x6c, + 0x69, 0x67, 0x6e, 0x3a, 0x20, 0x63, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x73, 0x69, 0x7a, 0x65, + 0x3a, 0x20, 0x38, 0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x38, 0x38, 0x38, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x3c, 0x2f, + 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x3c, 0x73, + 0x63, 0x72, 0x69, 0x70, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, + 0x6d, 0x6f, 0x64, 0x75, 0x6c, 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x2c, 0x20, 0x68, 0x2c, + 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x20, 0x65, 0x66, 0x66, + 0x65, 0x63, 0x74, 0x2c, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, + 0x64, 0x2c, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x2c, 0x20, 0x75, + 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x20, 0x75, 0x73, + 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x20, 0x75, 0x73, 0x65, + 0x52, 0x65, 0x66, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x66, 0x72, + 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x2e, 0x6a, + 0x73, 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, + 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x20, + 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, + 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x3b, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x73, 0x69, + 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3a, 0x20, 0x22, 0x54, 0x68, + 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f, 0x6e, 0x76, + 0x65, 0x72, 0x73, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x62, 0x65, 0x74, + 0x77, 0x65, 0x65, 0x6e, 0x20, 0x75, 0x73, 0x65, 0x72, 0x20, 0x61, 0x6e, + 0x64, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x61, 0x20, 0x66, + 0x72, 0x69, 0x65, 0x6e, 0x64, 0x6c, 0x79, 0x20, 0x63, 0x68, 0x61, 0x74, + 0x62, 0x6f, 0x74, 0x2e, 0x20, 0x72, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x64, + 0x20, 0x69, 0x6e, 0x20, 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x20, 0x6d, + 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x2e, 0x22, 0x2c, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, + 0x65, 0x3a, 0x20, 0x22, 0x7b, 0x7b, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, + 0x7d, 0x7d, 0x5c, 0x6e, 0x5c, 0x6e, 0x7b, 0x7b, 0x68, 0x69, 0x73, 0x74, + 0x6f, 0x72, 0x79, 0x7d, 0x7d, 0x5c, 0x6e, 0x7b, 0x7b, 0x63, 0x68, 0x61, + 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x3a, 0x20, 0x22, 0x7b, 0x7b, 0x6e, 0x61, 0x6d, + 0x65, 0x7d, 0x7d, 0x3a, 0x20, 0x7b, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, + 0x67, 0x65, 0x7d, 0x7d, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3a, + 0x20, 0x5b, 0x5d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, + 0x79, 0x70, 0x65, 0x3a, 0x20, 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, 0x2c, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x68, 0x61, 0x72, 0x3a, + 0x20, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x22, 0x2c, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x3a, 0x20, 0x22, 0x55, + 0x73, 0x65, 0x72, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3a, 0x20, 0x22, 0x54, 0x68, 0x69, - 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, - 0x72, 0x73, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x62, 0x65, 0x74, 0x77, - 0x65, 0x65, 0x6e, 0x20, 0x75, 0x73, 0x65, 0x72, 0x20, 0x61, 0x6e, 0x64, - 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x61, 0x20, 0x66, 0x72, - 0x69, 0x65, 0x6e, 0x64, 0x6c, 0x79, 0x20, 0x63, 0x68, 0x61, 0x74, 0x62, - 0x6f, 0x74, 0x2e, 0x20, 0x72, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x64, 0x20, - 0x69, 0x6e, 0x20, 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x20, 0x6d, 0x61, - 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x2e, 0x22, 0x2c, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x3a, 0x20, 0x22, 0x7b, 0x7b, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x7d, - 0x7d, 0x5c, 0x6e, 0x5c, 0x6e, 0x7b, 0x7b, 0x68, 0x69, 0x73, 0x74, 0x6f, - 0x72, 0x79, 0x7d, 0x7d, 0x5c, 0x6e, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, - 0x7d, 0x7d, 0x3a, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x3a, 0x20, 0x22, 0x7b, 0x7b, 0x6e, 0x61, 0x6d, 0x65, - 0x7d, 0x7d, 0x3a, 0x20, 0x7b, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x7d, 0x7d, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3a, 0x20, - 0x5b, 0x5d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, - 0x70, 0x65, 0x3a, 0x20, 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, 0x2c, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x68, 0x61, 0x72, 0x3a, 0x20, - 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x3a, 0x20, 0x22, 0x55, 0x73, - 0x65, 0x72, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, - 0x61, 0x6c, 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, - 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x3a, 0x20, 0x34, 0x30, - 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x6d, - 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a, 0x20, 0x30, 0x2e, - 0x37, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x70, - 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x3a, 0x20, - 0x32, 0x35, 0x36, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x20, 0x3d, 0x20, - 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x20, 0x70, 0x65, 0x6e, 0x61, - 0x6c, 0x74, 0x79, 0x2c, 0x20, 0x2d, 0x31, 0x20, 0x3d, 0x20, 0x63, 0x6f, - 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, - 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x31, 0x2e, 0x31, - 0x38, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, + 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x3a, 0x20, 0x34, + 0x30, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, + 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a, 0x20, 0x30, + 0x2e, 0x37, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x3a, + 0x20, 0x32, 0x35, 0x36, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x20, 0x3d, + 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x20, 0x70, 0x65, 0x6e, + 0x61, 0x6c, 0x74, 0x79, 0x2c, 0x20, 0x2d, 0x31, 0x20, 0x3d, 0x20, 0x63, + 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, + 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x31, 0x2e, + 0x31, 0x38, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, + 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x3a, 0x20, 0x34, + 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x3c, 0x3d, 0x20, 0x30, 0x20, 0x74, + 0x6f, 0x20, 0x75, 0x73, 0x65, 0x20, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x20, + 0x73, 0x69, 0x7a, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, + 0x6f, 0x70, 0x5f, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x2c, 0x20, 0x2f, + 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, + 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, + 0x66, 0x73, 0x5f, 0x7a, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x2f, + 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, + 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, + 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x3a, 0x20, 0x31, 0x2e, + 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x3a, 0x20, 0x34, 0x30, - 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x3c, 0x3d, 0x20, 0x30, 0x20, 0x74, 0x6f, - 0x20, 0x75, 0x73, 0x65, 0x20, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x20, 0x73, - 0x69, 0x7a, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, - 0x70, 0x5f, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x2c, 0x20, 0x2f, 0x2f, - 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, - 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x66, - 0x73, 0x5f, 0x7a, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x2f, 0x2f, - 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, - 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, - 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x3a, 0x20, 0x31, 0x2e, 0x30, - 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, + 0x20, 0x20, 0x20, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, + 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x30, 0x2e, 0x30, + 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, - 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, - 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, - 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x66, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, - 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, - 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, - 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x3a, 0x20, 0x30, - 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x2f, 0x31, 0x2f, 0x32, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, - 0x74, 0x5f, 0x74, 0x61, 0x75, 0x3a, 0x20, 0x35, 0x2c, 0x20, 0x2f, 0x2f, - 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x65, 0x6e, 0x74, 0x72, - 0x6f, 0x70, 0x79, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, - 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, 0x61, 0x3a, 0x20, - 0x30, 0x2e, 0x31, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x6c, 0x65, 0x61, 0x72, - 0x6e, 0x69, 0x6e, 0x67, 0x20, 0x72, 0x61, 0x74, 0x65, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, - 0x74, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, - 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, - 0x65, 0x72, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, - 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, - 0x69, 0x6e, 0x67, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, - 0x65, 0x64, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x63, 0x6f, 0x6e, + 0x20, 0x20, 0x66, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f, + 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x30, 0x2e, 0x30, + 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, + 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x3a, 0x20, + 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x2f, 0x31, 0x2f, 0x32, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, + 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, 0x3a, 0x20, 0x35, 0x2c, 0x20, 0x2f, + 0x2f, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x65, 0x6e, 0x74, + 0x72, 0x6f, 0x70, 0x79, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, + 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, 0x61, 0x3a, + 0x20, 0x30, 0x2e, 0x31, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x6c, 0x65, 0x61, + 0x72, 0x6e, 0x69, 0x6e, 0x67, 0x20, 0x72, 0x61, 0x74, 0x65, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, + 0x61, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, + 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, + 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, + 0x74, 0x69, 0x6e, 0x67, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, + 0x74, 0x65, 0x64, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x63, 0x6f, + 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x20, 0x29, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, + 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64, 0x20, 0x3d, + 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x28, 0x28, 0x29, + 0x20, 0x3d, 0x3e, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, + 0x72, 0x69, 0x70, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, + 0x3e, 0x20, 0x30, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, + 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x28, + 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x29, 0x20, + 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, + 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, + 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, + 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, + 0x61, 0x74, 0x65, 0x20, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x73, 0x74, + 0x72, 0x2c, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x53, 0x65, 0x74, 0x74, + 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x73, 0x65, 0x74, + 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, 0x73, + 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x78, 0x74, + 0x72, 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, + 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, + 0x2e, 0x2e, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x2c, 0x20, + 0x2e, 0x2e, 0x2e, 0x65, 0x78, 0x74, 0x72, 0x61, 0x53, 0x65, 0x74, 0x74, + 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, + 0x73, 0x74, 0x72, 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, + 0x41, 0x6c, 0x6c, 0x28, 0x2f, 0x5c, 0x7b, 0x5c, 0x7b, 0x28, 0x2e, 0x2a, + 0x3f, 0x29, 0x5c, 0x7d, 0x5c, 0x7d, 0x2f, 0x67, 0x2c, 0x20, 0x28, 0x5f, + 0x2c, 0x20, 0x6b, 0x65, 0x79, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x74, 0x74, 0x69, + 0x6e, 0x67, 0x73, 0x5b, 0x6b, 0x65, 0x79, 0x5d, 0x29, 0x29, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, + 0x2f, 0x20, 0x73, 0x65, 0x6e, 0x64, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, + 0x67, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, + 0x68, 0x61, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, + 0x28, 0x6d, 0x73, 0x67, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x20, 0x29, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, - 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64, 0x20, 0x3d, 0x20, - 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x28, 0x28, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, - 0x69, 0x70, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3e, - 0x20, 0x30, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, - 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x74, - 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, - 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, + 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, + 0x28, 0x27, 0x61, 0x6c, 0x72, 0x65, 0x61, 0x64, 0x79, 0x20, 0x72, 0x75, + 0x6e, 0x6e, 0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e, 0x27, 0x29, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, + 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, + 0x20, 0x6e, 0x65, 0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43, 0x6f, + 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, + 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, + 0x5b, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, + 0x72, 0x69, 0x70, 0x74, 0x2c, 0x20, 0x5b, 0x22, 0x7b, 0x7b, 0x75, 0x73, + 0x65, 0x72, 0x7d, 0x7d, 0x22, 0x2c, 0x20, 0x6d, 0x73, 0x67, 0x5d, 0x5d, + 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x3d, 0x20, + 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, + 0x67, 0x65, 0x3a, 0x20, 0x6d, 0x73, 0x67, 0x2c, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, + 0x3a, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, + 0x70, 0x74, 0x2e, 0x66, 0x6c, 0x61, 0x74, 0x4d, 0x61, 0x70, 0x28, 0x28, + 0x5b, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, + 0x67, 0x65, 0x5d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, + 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c, 0x20, + 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, + 0x67, 0x65, 0x7d, 0x29, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x22, + 0x5c, 0x6e, 0x22, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, + 0x65, 0x74, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, + 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x27, 0x27, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x3d, 0x20, 0x73, 0x65, + 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, + 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, - 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x20, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x73, 0x74, 0x72, - 0x2c, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, - 0x6e, 0x67, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x73, 0x65, 0x74, 0x74, - 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, - 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x78, 0x74, 0x72, - 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x74, - 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, - 0x2e, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x2c, 0x20, 0x2e, - 0x2e, 0x2e, 0x65, 0x78, 0x74, 0x72, 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, - 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, - 0x74, 0x72, 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x41, - 0x6c, 0x6c, 0x28, 0x2f, 0x5c, 0x7b, 0x5c, 0x7b, 0x28, 0x2e, 0x2a, 0x3f, - 0x29, 0x5c, 0x7d, 0x5c, 0x7d, 0x2f, 0x67, 0x2c, 0x20, 0x28, 0x5f, 0x2c, - 0x20, 0x6b, 0x65, 0x79, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, - 0x67, 0x73, 0x5b, 0x6b, 0x65, 0x79, 0x5d, 0x29, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, - 0x20, 0x73, 0x65, 0x6e, 0x64, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x20, 0x74, 0x6f, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, - 0x61, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, - 0x6d, 0x73, 0x67, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, - 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, - 0x27, 0x61, 0x6c, 0x72, 0x65, 0x61, 0x64, 0x79, 0x20, 0x72, 0x75, 0x6e, - 0x6e, 0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e, 0x27, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, + 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x73, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, + 0x3e, 0x22, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x28, 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x7d, 0x3a, 0x22, + 0x29, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, + 0x22, 0x7b, 0x7b, 0x75, 0x73, 0x65, 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x29, + 0x5d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, + 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, + 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, + 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x6c, 0x6c, + 0x61, 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, + 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3a, + 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x29, 0x29, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x63, 0x68, 0x75, + 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, + 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x64, + 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, + 0x20, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x20, 0x6c, 0x65, 0x61, 0x64, + 0x69, 0x6e, 0x67, 0x20, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, + 0x63, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, + 0x65, 0x20, 0x3d, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, + 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, + 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x5c, 0x73, 0x2b, 0x2f, 0x2c, 0x20, 0x22, + 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, + 0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, 0x2e, 0x2e, 0x2e, 0x68, 0x69, 0x73, + 0x74, 0x6f, 0x72, 0x79, 0x2c, 0x20, 0x5b, 0x22, 0x7b, 0x7b, 0x63, 0x68, + 0x61, 0x72, 0x7d, 0x7d, 0x22, 0x2c, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, + 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x5d, 0x5d, 0x29, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, + 0x20, 0x28, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, + 0x28, 0x22, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, + 0x20, 0x66, 0x69, 0x6e, 0x69, 0x73, 0x68, 0x65, 0x64, 0x3a, 0x20, 0x27, + 0x22, 0x2c, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, + 0x73, 0x73, 0x61, 0x67, 0x65, 0x2c, 0x20, 0x22, 0x27, 0x2c, 0x20, 0x73, + 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x64, + 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x69, 0x66, 0x20, 0x28, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, + 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, + 0x3d, 0x20, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, + 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, - 0x6e, 0x65, 0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43, 0x6f, 0x6e, - 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, - 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, - 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, - 0x69, 0x70, 0x74, 0x2c, 0x20, 0x5b, 0x22, 0x7b, 0x7b, 0x75, 0x73, 0x65, - 0x72, 0x7d, 0x7d, 0x22, 0x2c, 0x20, 0x6d, 0x73, 0x67, 0x5d, 0x5d, 0x29, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x3d, 0x20, 0x74, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x3a, 0x20, 0x6d, 0x73, 0x67, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x3a, - 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, - 0x74, 0x2e, 0x66, 0x6c, 0x61, 0x74, 0x4d, 0x61, 0x70, 0x28, 0x28, 0x5b, - 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x5d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, - 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c, 0x20, 0x7b, - 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x7d, 0x29, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x22, 0x5c, - 0x6e, 0x22, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, - 0x74, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, - 0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x27, 0x27, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x68, - 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, - 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, - 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, - 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, - 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x73, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, 0x3e, - 0x22, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, - 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x29, - 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x22, - 0x7b, 0x7b, 0x75, 0x73, 0x65, 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x29, 0x5d, - 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, - 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, - 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, - 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x6c, 0x6c, 0x61, - 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, - 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3a, 0x20, - 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, - 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, - 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x64, 0x61, - 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, - 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x20, 0x6c, 0x65, 0x61, 0x64, 0x69, - 0x6e, 0x67, 0x20, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, - 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, - 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, - 0x20, 0x3d, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, - 0x73, 0x73, 0x61, 0x67, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, - 0x65, 0x28, 0x2f, 0x5e, 0x5c, 0x73, 0x2b, 0x2f, 0x2c, 0x20, 0x22, 0x22, - 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, - 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x28, 0x5b, 0x2e, 0x2e, 0x2e, 0x68, 0x69, 0x73, 0x74, - 0x6f, 0x72, 0x79, 0x2c, 0x20, 0x5b, 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, - 0x72, 0x7d, 0x7d, 0x22, 0x2c, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, - 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x5d, 0x5d, 0x29, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, - 0x28, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, - 0x22, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x66, 0x69, 0x6e, 0x69, 0x73, 0x68, 0x65, 0x64, 0x3a, 0x20, 0x27, 0x22, - 0x2c, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, - 0x73, 0x61, 0x67, 0x65, 0x2c, 0x20, 0x22, 0x27, 0x2c, 0x20, 0x73, 0x75, - 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x64, 0x61, - 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, - 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, - 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, - 0x20, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, - 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, - 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x6e, - 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x20, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, 0x6e, 0x70, 0x75, - 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, - 0x6c, 0x28, 0x22, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x20, + 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, + 0x67, 0x65, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, + 0x61, 0x6c, 0x28, 0x22, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x74, 0x6f, 0x70, + 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x2e, 0x70, 0x72, + 0x65, 0x76, 0x65, 0x6e, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, + 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, + 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x2e, 0x70, 0x72, 0x65, - 0x76, 0x65, 0x6e, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x28, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, - 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, - 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x28, + 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, + 0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x75, 0x62, 0x6d, 0x69, + 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, + 0x70, 0x28, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x68, 0x61, 0x74, 0x28, 0x6d, 0x65, 0x73, 0x73, 0x61, + 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, + 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x22, + 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x65, 0x6e, 0x74, 0x65, 0x72, 0x53, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x73, + 0x20, 0x3d, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, + 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x66, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x68, + 0x69, 0x63, 0x68, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x31, 0x33, 0x20, 0x26, + 0x26, 0x20, 0x21, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x68, 0x69, + 0x66, 0x74, 0x4b, 0x65, 0x79, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x62, 0x6d, 0x69, + 0x74, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x65, 0x74, 0x20, 0x3d, - 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x28, 0x65, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, - 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x28, 0x5b, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, - 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, - 0x28, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x68, 0x61, 0x74, 0x28, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x22, 0x22, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, - 0x6e, 0x74, 0x65, 0x72, 0x53, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x73, 0x20, - 0x3d, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, - 0x66, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x68, 0x69, - 0x63, 0x68, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x31, 0x33, 0x20, 0x26, 0x26, - 0x20, 0x21, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x68, 0x69, 0x66, - 0x74, 0x4b, 0x65, 0x79, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, - 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x72, 0x6d, - 0x20, 0x6f, 0x6e, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x3d, 0x24, 0x7b, - 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x74, - 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x72, - 0x6f, 0x77, 0x73, 0x3d, 0x32, 0x20, 0x6f, 0x6e, 0x6b, 0x65, 0x79, 0x70, - 0x72, 0x65, 0x73, 0x73, 0x3d, 0x24, 0x7b, 0x65, 0x6e, 0x74, 0x65, 0x72, - 0x53, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x73, 0x7d, 0x20, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, - 0x24, 0x7b, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x6d, 0x65, 0x73, - 0x73, 0x61, 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, - 0x20, 0x65, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x7d, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x68, 0x6f, - 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, 0x53, 0x61, 0x79, 0x20, 0x73, 0x6f, - 0x6d, 0x65, 0x74, 0x68, 0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e, 0x22, 0x2f, + 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x72, + 0x6d, 0x20, 0x6f, 0x6e, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x3d, 0x24, + 0x7b, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x7d, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x63, 0x6c, - 0x61, 0x73, 0x73, 0x3d, 0x22, 0x72, 0x69, 0x67, 0x68, 0x74, 0x22, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x74, 0x79, 0x70, - 0x65, 0x3d, 0x22, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x22, 0x20, 0x64, - 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x21, 0x67, - 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x7d, 0x20, 0x3e, 0x53, 0x65, 0x6e, 0x64, 0x3c, 0x2f, - 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, - 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, - 0x24, 0x7b, 0x73, 0x74, 0x6f, 0x70, 0x7d, 0x20, 0x64, 0x69, 0x73, 0x61, - 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x67, 0x65, 0x6e, 0x65, 0x72, - 0x61, 0x74, 0x69, 0x6e, 0x67, 0x7d, 0x3e, 0x53, 0x74, 0x6f, 0x70, 0x3c, + 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, + 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, + 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x32, 0x20, 0x6f, 0x6e, 0x6b, 0x65, 0x79, + 0x70, 0x72, 0x65, 0x73, 0x73, 0x3d, 0x24, 0x7b, 0x65, 0x6e, 0x74, 0x65, + 0x72, 0x53, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x73, 0x7d, 0x20, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, + 0x67, 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x3d, 0x24, 0x7b, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x6d, 0x65, + 0x73, 0x73, 0x61, 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, + 0x3d, 0x20, 0x65, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x68, + 0x6f, 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, 0x53, 0x61, 0x79, 0x20, 0x73, + 0x6f, 0x6d, 0x65, 0x74, 0x68, 0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e, 0x22, + 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x63, + 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x72, 0x69, 0x67, 0x68, 0x74, 0x22, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x74, 0x79, + 0x70, 0x65, 0x3d, 0x22, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x22, 0x20, + 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x21, + 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x20, 0x3e, 0x53, 0x65, 0x6e, 0x64, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, - 0x3d, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x65, 0x74, 0x7d, 0x3e, 0x52, 0x65, - 0x73, 0x65, 0x74, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, - 0x68, 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, - 0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x65, - 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, - 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, - 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, - 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x75, - 0x73, 0x65, 0x52, 0x65, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x45, 0x66, - 0x66, 0x65, 0x63, 0x74, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, - 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x20, 0x74, 0x6f, 0x20, 0x62, 0x6f, - 0x74, 0x74, 0x6f, 0x6d, 0x20, 0x28, 0x69, 0x66, 0x20, 0x6e, 0x65, 0x65, - 0x64, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, - 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x26, - 0x26, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, - 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, - 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x20, 0x3c, 0x3d, 0x20, - 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, - 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, - 0x54, 0x6f, 0x70, 0x20, 0x2b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, - 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, - 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, - 0x20, 0x2b, 0x20, 0x33, 0x30, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, - 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, - 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x54, 0x6f, 0x28, 0x30, - 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, - 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, - 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x29, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x73, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x74, 0x4c, 0x69, - 0x6e, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x5b, 0x75, 0x73, 0x65, 0x72, 0x2c, - 0x20, 0x6d, 0x73, 0x67, 0x5d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x70, 0x20, 0x6b, - 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x6d, 0x73, 0x67, 0x7d, 0x3e, 0x3c, 0x73, - 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x7b, 0x74, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x28, 0x75, 0x73, 0x65, 0x72, 0x29, 0x7d, 0x3a, - 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x20, 0x3c, 0x24, - 0x7b, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, 0x68, - 0x7d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x24, 0x7b, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x6d, 0x73, 0x67, 0x29, 0x7d, 0x20, - 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x68, - 0x61, 0x74, 0x22, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, 0x7b, 0x63, 0x6f, - 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x7d, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6d, 0x65, - 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x66, 0x6c, 0x61, 0x74, 0x4d, - 0x61, 0x70, 0x28, 0x63, 0x68, 0x61, 0x74, 0x4c, 0x69, 0x6e, 0x65, 0x29, - 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, - 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x60, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, - 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, - 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, - 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x28, - 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, - 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, - 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, + 0x3d, 0x24, 0x7b, 0x73, 0x74, 0x6f, 0x70, 0x7d, 0x20, 0x64, 0x69, 0x73, + 0x61, 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x67, 0x65, 0x6e, 0x65, + 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x7d, 0x3e, 0x53, 0x74, 0x6f, 0x70, + 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, + 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, + 0x6b, 0x3d, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x65, 0x74, 0x7d, 0x3e, 0x52, + 0x65, 0x73, 0x65, 0x74, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x43, 0x68, 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3d, 0x20, 0x28, 0x70, + 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, + 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x65, + 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, + 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20, 0x3d, 0x20, + 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x45, + 0x66, 0x66, 0x65, 0x63, 0x74, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, + 0x20, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x20, 0x74, 0x6f, 0x20, 0x62, + 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x20, 0x28, 0x69, 0x66, 0x20, 0x6e, 0x65, + 0x65, 0x64, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, + 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x20, + 0x26, 0x26, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, + 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, + 0x6f, 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x20, 0x3c, 0x3d, + 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, + 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, + 0x6c, 0x54, 0x6f, 0x70, 0x20, 0x2b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, + 0x2e, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x48, 0x65, 0x69, 0x67, 0x68, + 0x74, 0x20, 0x2b, 0x20, 0x33, 0x30, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, + 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x54, 0x6f, 0x28, + 0x30, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, + 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, + 0x6f, 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x29, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x6d, 0x65, 0x73, 0x73, 0x61, + 0x67, 0x65, 0x73, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x74, 0x4c, + 0x69, 0x6e, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x5b, 0x75, 0x73, 0x65, 0x72, + 0x2c, 0x20, 0x6d, 0x73, 0x67, 0x5d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x70, 0x20, + 0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x6d, 0x73, 0x67, 0x7d, 0x3e, 0x3c, + 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x7b, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x75, 0x73, 0x65, 0x72, 0x29, 0x7d, + 0x3a, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x20, 0x3c, + 0x24, 0x7b, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, + 0x68, 0x7d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x24, 0x7b, 0x74, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x6d, 0x73, 0x67, 0x29, 0x7d, + 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, + 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, + 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, + 0x68, 0x61, 0x74, 0x22, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, 0x7b, 0x63, + 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x7d, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6d, + 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x66, 0x6c, 0x61, 0x74, + 0x4d, 0x61, 0x70, 0x28, 0x63, 0x68, 0x61, 0x74, 0x4c, 0x69, 0x6e, 0x65, + 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x2f, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x60, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, + 0x46, 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, + 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, + 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, + 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x73, 0x65, 0x73, 0x73, + 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, + 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, + 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, + 0x3a, 0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, + 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x28, + 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, + 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, + 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, 0x65, + 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x3d, + 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, + 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, - 0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x70, 0x61, 0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x28, + 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, - 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x28, 0x65, - 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, - 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, 0x65, 0x6c, - 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x3d, 0x20, - 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, - 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, + 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x20, 0x3d, + 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, + 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, + 0x20, 0x4d, 0x61, 0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28, 0x70, 0x61, 0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x29, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, - 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x20, 0x3d, 0x20, - 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, - 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, - 0x4d, 0x61, 0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28, 0x70, - 0x61, 0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x65, 0x6c, - 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x29, 0x29, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x46, 0x6c, 0x6f, 0x61, 0x74, - 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x28, 0x7b, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x2c, 0x20, 0x6d, 0x69, - 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x65, - 0x70, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x75, 0x65, 0x29, 0x29, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x46, 0x6c, 0x6f, 0x61, + 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x28, 0x7b, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x2c, 0x20, 0x6d, + 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x74, + 0x65, 0x70, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x29, 0x20, + 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, + 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, + 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, + 0x7d, 0x22, 0x3e, 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x7d, 0x3c, + 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, + 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, + 0x67, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, + 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x24, 0x7b, + 0x6d, 0x69, 0x6e, 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x24, + 0x7b, 0x6d, 0x61, 0x78, 0x7d, 0x22, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3d, + 0x22, 0x24, 0x7b, 0x73, 0x74, 0x65, 0x70, 0x7d, 0x22, 0x20, 0x6e, 0x61, + 0x6d, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, + 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, + 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, + 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x7d, 0x20, 0x2f, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, - 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, - 0x22, 0x3e, 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x7d, 0x3c, 0x2f, + 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x3d, 0x20, + 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, + 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, + 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, + 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, + 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x3e, + 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x7d, 0x3c, 0x2f, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, + 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x22, + 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, + 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x69, 0x6e, + 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x61, + 0x78, 0x7d, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x24, 0x7b, + 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x3d, 0x22, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x22, 0x20, + 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, + 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, + 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, + 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, + 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x72, + 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, + 0x70, 0x74, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, - 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, - 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, - 0x65, 0x7d, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x24, 0x7b, 0x6d, - 0x69, 0x6e, 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x24, 0x7b, - 0x6d, 0x61, 0x78, 0x7d, 0x22, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3d, 0x22, - 0x24, 0x7b, 0x73, 0x74, 0x65, 0x70, 0x7d, 0x22, 0x20, 0x6e, 0x61, 0x6d, - 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, - 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x7d, 0x20, 0x2f, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, - 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x28, - 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x2c, - 0x20, 0x6d, 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, + 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, + 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, + 0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, + 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x6d, + 0x70, 0x74, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x34, 0x20, + 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, + 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, + 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, + 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, + 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, + 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, - 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x3e, 0x24, - 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x7d, 0x3c, 0x2f, 0x6c, 0x61, 0x62, - 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, - 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x22, 0x20, - 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, - 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x69, 0x6e, 0x7d, - 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x61, 0x78, - 0x7d, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6e, - 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, - 0x22, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x22, 0x20, 0x6f, - 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, - 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, - 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, - 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x72, 0x6d, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70, - 0x74, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3c, 0x2f, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, - 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, - 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, - 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, - 0x74, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x34, 0x20, 0x6f, - 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, + 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, 0x3e, 0x55, 0x73, 0x65, 0x72, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, - 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, - 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, - 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, + 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, + 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, 0x20, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, + 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x75, 0x73, + 0x65, 0x72, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, + 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, - 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, 0x3e, 0x55, 0x73, 0x65, 0x72, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, - 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, 0x20, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, - 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x75, 0x73, 0x65, - 0x72, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, - 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, - 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, - 0x62, 0x6f, 0x74, 0x22, 0x3e, 0x42, 0x6f, 0x74, 0x20, 0x6e, 0x61, 0x6d, - 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, + 0x22, 0x62, 0x6f, 0x74, 0x22, 0x3e, 0x42, 0x6f, 0x74, 0x20, 0x6e, 0x61, + 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, - 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, - 0x22, 0x63, 0x68, 0x61, 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x22, - 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, - 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, + 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, + 0x3d, 0x22, 0x63, 0x68, 0x61, 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x63, 0x68, 0x61, 0x72, 0x7d, + 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, + 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, + 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, + 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, + 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, + 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, + 0x65, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x74, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, + 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, + 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, + 0x61, 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x74, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, + 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, + 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, + 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, + 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x43, 0x68, 0x61, 0x74, 0x20, + 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x74, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, + 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, + 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x68, 0x69, 0x73, + 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, + 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, + 0x31, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, + 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, + 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, - 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, - 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x74, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x34, - 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, - 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, - 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x43, 0x68, 0x61, 0x74, 0x20, 0x68, - 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, - 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x68, 0x69, 0x73, 0x74, - 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, - 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, - 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x31, - 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, - 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, - 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, - 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, - 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, - 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x3a, 0x20, 0x22, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x73, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x30, - 0x34, 0x38, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x2d, 0x31, 0x2c, - 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6e, 0x5f, 0x70, 0x72, - 0x65, 0x64, 0x69, 0x63, 0x74, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, - 0x74, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, - 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x3a, 0x20, 0x22, 0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, - 0x72, 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, - 0x35, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, - 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x65, 0x6d, 0x70, - 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, 0x73, 0x74, - 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, - 0x61, 0x74, 0x75, 0x72, 0x65, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, - 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x65, 0x6e, 0x61, 0x6c, - 0x69, 0x7a, 0x65, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x20, 0x73, - 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, - 0x78, 0x3a, 0x20, 0x32, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, - 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, - 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61, - 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, - 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, - 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, - 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, + 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3a, 0x20, 0x22, 0x43, 0x6f, 0x6e, 0x73, 0x69, 0x64, 0x65, 0x72, - 0x20, 0x4e, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x66, 0x6f, - 0x72, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x22, 0x2c, - 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x30, 0x34, 0x38, 0x2c, 0x20, - 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, - 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, - 0x73, 0x74, 0x5f, 0x6e, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, - 0x73, 0x74, 0x5f, 0x6e, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, + 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x73, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, + 0x30, 0x34, 0x38, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x2d, 0x31, + 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6e, 0x5f, 0x70, + 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, + 0x63, 0x74, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x6f, 0x70, 0x2d, 0x4b, 0x20, 0x73, 0x61, - 0x6d, 0x70, 0x6c, 0x69, 0x6e, 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, - 0x3a, 0x20, 0x31, 0x30, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, - 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, - 0x6f, 0x70, 0x5f, 0x6b, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x7d, 0x29, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, - 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x6f, - 0x70, 0x2d, 0x50, 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, 0x67, - 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, - 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, - 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x6f, 0x70, 0x5f, 0x70, 0x22, - 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, - 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, - 0x70, 0x5f, 0x70, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, - 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x4d, 0x6f, - 0x72, 0x65, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x3c, 0x2f, - 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, - 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, + 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, + 0x2e, 0x35, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, + 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x65, 0x6d, + 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, 0x73, + 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, 0x65, + 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x46, 0x53, 0x2d, - 0x5a, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x65, 0x6e, 0x61, + 0x6c, 0x69, 0x7a, 0x65, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x20, + 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x22, 0x2c, 0x20, 0x6d, + 0x61, 0x78, 0x3a, 0x20, 0x32, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, + 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, + 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, + 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, + 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, + 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, + 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x43, 0x6f, 0x6e, 0x73, 0x69, 0x64, 0x65, + 0x72, 0x20, 0x4e, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x66, + 0x6f, 0x72, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x22, + 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x30, 0x34, 0x38, 0x2c, + 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, + 0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, + 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, + 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, + 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x6f, 0x70, 0x2d, 0x4b, 0x20, 0x73, + 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, + 0x78, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, + 0x20, 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, + 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x7d, 0x29, 0x7d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, + 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, + 0x6f, 0x70, 0x2d, 0x50, 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, + 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x66, 0x73, 0x5f, 0x7a, + 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x6f, 0x70, 0x5f, 0x70, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, - 0x66, 0x73, 0x5f, 0x7a, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, - 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x79, 0x70, 0x69, - 0x63, 0x61, 0x6c, 0x20, 0x50, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, - 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, - 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, - 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x22, 0x2c, 0x20, 0x73, - 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, 0x69, 0x63, - 0x61, 0x6c, 0x5f, 0x70, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, - 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x72, 0x65, 0x73, - 0x65, 0x6e, 0x63, 0x65, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, - 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, - 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, - 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, - 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, - 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, - 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, - 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, - 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, - 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, - 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x46, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, - 0x63, 0x79, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, - 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, - 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, - 0x65, 0x3a, 0x20, 0x22, 0x66, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, - 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, + 0x6f, 0x70, 0x5f, 0x70, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, + 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x4d, + 0x6f, 0x72, 0x65, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x3c, + 0x2f, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, + 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, + 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, + 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, + 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x46, 0x53, + 0x2d, 0x5a, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, + 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x66, 0x73, 0x5f, + 0x7a, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, + 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, + 0x74, 0x66, 0x73, 0x5f, 0x7a, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, + 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, + 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x79, 0x70, + 0x69, 0x63, 0x61, 0x6c, 0x20, 0x50, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, + 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, + 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, + 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x66, 0x72, 0x65, 0x71, - 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, - 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, - 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x72, 0x20, 0x2f, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, - 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x68, 0x72, 0x65, 0x65, 0x22, 0x3e, + 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, 0x69, + 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, + 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, + 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x72, 0x65, + 0x73, 0x65, 0x6e, 0x63, 0x65, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, + 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, + 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x70, 0x72, 0x65, 0x73, 0x65, + 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, + 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, + 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, + 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, + 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, + 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x46, 0x72, 0x65, 0x71, 0x75, 0x65, + 0x6e, 0x63, 0x79, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, + 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, + 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, + 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x66, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, + 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, + 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, + 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x66, 0x72, 0x65, + 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, + 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, + 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x72, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, - 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, - 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, - 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x3d, 0x22, 0x30, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, - 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, - 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, - 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, - 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, - 0x3e, 0x20, 0x6e, 0x6f, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, - 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, + 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x68, 0x72, 0x65, 0x65, 0x22, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3d, 0x22, 0x31, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, + 0x75, 0x65, 0x3d, 0x22, 0x30, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, - 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x31, 0x7d, 0x20, 0x6f, 0x6e, 0x69, + 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, - 0x2f, 0x3e, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, - 0x76, 0x31, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, + 0x2f, 0x3e, 0x20, 0x6e, 0x6f, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, + 0x61, 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x32, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, + 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x31, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, - 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x32, 0x7d, 0x20, 0x6f, 0x6e, + 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x31, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, - 0x20, 0x76, 0x32, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, + 0x20, 0x76, 0x31, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, - 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, - 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, 0x72, - 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x74, 0x61, 0x75, 0x22, 0x2c, 0x20, - 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6d, - 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, - 0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, - 0x5f, 0x74, 0x61, 0x75, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, - 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, - 0x74, 0x61, 0x75, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, - 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, 0x72, 0x6f, 0x73, - 0x74, 0x61, 0x74, 0x20, 0x65, 0x74, 0x61, 0x22, 0x2c, 0x20, 0x6d, 0x61, - 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, - 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, - 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, - 0x61, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, - 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, 0x61, - 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, - 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, - 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, - 0x2f, 0x20, 0x70, 0x6f, 0x6f, 0x72, 0x20, 0x6d, 0x61, 0x6e, 0x73, 0x20, - 0x6d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x20, 0x72, 0x65, 0x70, - 0x6c, 0x61, 0x63, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, 0x61, 0x72, 0x6b, 0x64, - 0x6f, 0x77, 0x6e, 0x69, 0x73, 0x68, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, - 0x64, 0x20, 0x3d, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x74, - 0x65, 0x78, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x26, 0x2f, - 0x67, 0x2c, 0x20, 0x27, 0x26, 0x61, 0x6d, 0x70, 0x3b, 0x27, 0x29, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, - 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x3c, 0x2f, 0x67, 0x2c, 0x20, 0x27, - 0x26, 0x6c, 0x74, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, - 0x2f, 0x3e, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x67, 0x74, 0x3b, 0x27, - 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, - 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x23, 0x7b, 0x31, - 0x2c, 0x36, 0x7d, 0x20, 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x69, - 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x68, 0x33, 0x3e, 0x24, 0x31, 0x3c, 0x2f, - 0x68, 0x33, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, - 0x5c, 0x2a, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x2a, 0x5c, - 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, - 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, - 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x5f, - 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x5f, 0x2f, 0x67, 0x2c, 0x20, 0x27, - 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, - 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, - 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, - 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, - 0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, + 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, + 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x32, 0x22, 0x20, 0x63, 0x68, 0x65, + 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, + 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, + 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x32, 0x7d, 0x20, 0x6f, + 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, + 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, + 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, + 0x74, 0x20, 0x76, 0x32, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, + 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, + 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x74, 0x61, 0x75, 0x22, 0x2c, + 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x2e, 0x30, 0x2c, 0x20, + 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, + 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, + 0x74, 0x5f, 0x74, 0x61, 0x75, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, + 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, + 0x5f, 0x74, 0x61, 0x75, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, + 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, 0x72, 0x6f, + 0x73, 0x74, 0x61, 0x74, 0x20, 0x65, 0x74, 0x61, 0x22, 0x2c, 0x20, 0x6d, + 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, + 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, + 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, + 0x74, 0x61, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, + 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, + 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, + 0x61, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, + 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, + 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x2f, 0x2f, 0x20, 0x70, 0x6f, 0x6f, 0x72, 0x20, 0x6d, 0x61, 0x6e, 0x73, + 0x20, 0x6d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x20, 0x72, 0x65, + 0x70, 0x6c, 0x61, 0x63, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, 0x61, 0x72, 0x6b, + 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, 0x68, 0x20, 0x3d, 0x20, 0x28, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x6d, 0x64, 0x20, 0x3d, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, + 0x74, 0x65, 0x78, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x26, + 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x61, 0x6d, 0x70, 0x3b, 0x27, 0x29, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, + 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x3c, 0x2f, 0x67, 0x2c, 0x20, + 0x27, 0x26, 0x6c, 0x74, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, - 0x28, 0x2f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x2f, 0x67, 0x2c, - 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x65, 0x6d, - 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60, 0x60, - 0x60, 0x2e, 0x2a, 0x3f, 0x5c, 0x6e, 0x28, 0x5b, 0x5c, 0x73, 0x5c, 0x53, - 0x5d, 0x2a, 0x3f, 0x29, 0x60, 0x60, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27, - 0x3c, 0x70, 0x72, 0x65, 0x3e, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, - 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x3c, 0x2f, 0x70, 0x72, - 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x28, 0x2f, 0x3e, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x67, 0x74, 0x3b, + 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, + 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x23, 0x7b, + 0x31, 0x2c, 0x36, 0x7d, 0x20, 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, + 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x68, 0x33, 0x3e, 0x24, 0x31, 0x3c, + 0x2f, 0x68, 0x33, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, + 0x2f, 0x5c, 0x2a, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x2a, + 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, + 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, + 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5f, + 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x5f, 0x2f, 0x67, 0x2c, 0x20, + 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, + 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, + 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, + 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, + 0x31, 0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, + 0x65, 0x28, 0x2f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x2f, 0x67, + 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x65, + 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60, - 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, - 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, - 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, - 0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x62, 0x72, 0x20, - 0x2f, 0x3e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, - 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x64, 0x61, 0x6e, 0x67, 0x65, 0x72, - 0x6f, 0x75, 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, 0x6e, 0x65, - 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x3d, 0x24, 0x7b, 0x7b, 0x20, 0x5f, 0x5f, - 0x68, 0x74, 0x6d, 0x6c, 0x3a, 0x20, 0x6d, 0x64, 0x20, 0x7d, 0x7d, 0x20, - 0x2f, 0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, - 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, - 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x6c, 0x6c, - 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, - 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x2f, 0x3e, 0x60, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x60, 0x60, 0x2e, 0x2a, 0x3f, 0x5c, 0x6e, 0x28, 0x5b, 0x5c, 0x73, 0x5c, + 0x53, 0x5d, 0x2a, 0x3f, 0x29, 0x60, 0x60, 0x60, 0x2f, 0x67, 0x2c, 0x20, + 0x27, 0x3c, 0x70, 0x72, 0x65, 0x3e, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, + 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x3c, 0x2f, 0x70, + 0x72, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, + 0x60, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27, + 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, + 0x64, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, + 0x5c, 0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x62, 0x72, + 0x20, 0x2f, 0x3e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, - 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, - 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, - 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, - 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, - 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x5f, 0x6d, 0x73, 0x2e, 0x74, 0x6f, 0x46, - 0x69, 0x78, 0x65, 0x64, 0x28, 0x29, 0x7d, 0x6d, 0x73, 0x20, 0x70, 0x65, - 0x72, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x2c, 0x20, 0x24, 0x7b, 0x6c, + 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x64, 0x61, 0x6e, 0x67, 0x65, + 0x72, 0x6f, 0x75, 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, 0x6e, + 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x3d, 0x24, 0x7b, 0x7b, 0x20, 0x5f, + 0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x3a, 0x20, 0x6d, 0x64, 0x20, 0x7d, 0x7d, + 0x20, 0x2f, 0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x28, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, - 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, - 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x32, 0x29, 0x7d, - 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x70, 0x65, 0x72, 0x20, - 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x41, 0x70, 0x70, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, - 0x29, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, - 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, - 0x65, 0x72, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x68, 0x31, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, - 0x70, 0x3c, 0x2f, 0x68, 0x31, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, 0x65, - 0x72, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, - 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, - 0x7b, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3f, 0x20, 0x43, 0x68, 0x61, - 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3a, 0x20, 0x43, 0x6f, 0x6e, 0x66, 0x69, - 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x6d, 0x61, - 0x69, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x69, 0x64, 0x3d, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, 0x22, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x24, 0x7b, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, 0x6e, - 0x70, 0x75, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x65, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, + 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, + 0x6d, 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x2f, 0x3e, 0x60, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, + 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, + 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, + 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, + 0x5f, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x5f, 0x6d, 0x73, 0x2e, 0x74, 0x6f, + 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x29, 0x7d, 0x6d, 0x73, 0x20, 0x70, + 0x65, 0x72, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x2c, 0x20, 0x24, 0x7b, + 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, + 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, 0x73, 0x65, 0x63, 0x6f, 0x6e, + 0x64, 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x32, 0x29, + 0x7d, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x70, 0x65, 0x72, + 0x20, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x41, 0x70, 0x70, 0x28, 0x70, 0x72, 0x6f, 0x70, + 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, + 0x76, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, + 0x6e, 0x65, 0x72, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x70, 0x3e, 0x3c, 0x24, 0x7b, 0x4d, 0x6f, 0x64, 0x65, 0x6c, - 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e, - 0x66, 0x6f, 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20, + 0x20, 0x3c, 0x68, 0x31, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, + 0x70, 0x70, 0x3c, 0x2f, 0x68, 0x31, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, + 0x65, 0x72, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x69, 0x64, 0x3d, + 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x70, 0x3e, 0x50, 0x6f, 0x77, 0x65, 0x72, 0x65, 0x64, 0x20, 0x62, 0x79, - 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, - 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, - 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x67, 0x65, 0x72, 0x67, 0x61, 0x6e, - 0x6f, 0x76, 0x2f, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, - 0x22, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x3c, - 0x2f, 0x61, 0x3e, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x3c, 0x61, 0x20, 0x68, - 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, - 0x2f, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x22, 0x3e, 0x67, 0x67, - 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x3c, 0x2f, 0x61, 0x3e, 0x2e, 0x3c, 0x2f, - 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, - 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x28, 0x41, 0x70, 0x70, 0x29, 0x2c, - 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x62, 0x6f, - 0x64, 0x79, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x63, 0x72, - 0x69, 0x70, 0x74, 0x3e, 0x0a, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, 0x3e, - 0x0a, 0x0a, 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x3c, 0x2f, 0x62, - 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x0a, 0x3c, 0x2f, 0x68, 0x74, 0x6d, 0x6c, - 0x3e, 0x0a + 0x24, 0x7b, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74, 0x65, + 0x64, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3f, 0x20, 0x43, 0x68, + 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3a, 0x20, 0x43, 0x6f, 0x6e, 0x66, + 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x6d, + 0x61, 0x69, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x20, 0x69, 0x64, 0x3d, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, 0x22, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x24, 0x7b, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, + 0x6e, 0x70, 0x75, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x65, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x70, 0x3e, 0x3c, 0x24, 0x7b, 0x4d, 0x6f, 0x64, 0x65, + 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, + 0x6e, 0x66, 0x6f, 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x70, 0x3e, 0x50, 0x6f, 0x77, 0x65, 0x72, 0x65, 0x64, 0x20, 0x62, + 0x79, 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, + 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74, 0x68, 0x75, + 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x67, 0x65, 0x72, 0x67, 0x61, + 0x6e, 0x6f, 0x76, 0x2f, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, + 0x70, 0x22, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, + 0x3c, 0x2f, 0x61, 0x3e, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x3c, 0x61, 0x20, + 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, + 0x2f, 0x2f, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x22, 0x3e, 0x67, + 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x3c, 0x2f, 0x61, 0x3e, 0x2e, 0x3c, + 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, + 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, + 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x28, 0x41, 0x70, 0x70, 0x29, + 0x2c, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x62, + 0x6f, 0x64, 0x79, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x63, + 0x72, 0x69, 0x70, 0x74, 0x3e, 0x0a, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, + 0x3e, 0x0a, 0x0a, 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x3c, 0x2f, + 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x0a, 0x3c, 0x2f, 0x68, 0x74, 0x6d, + 0x6c, 0x3e, 0x0a }; -unsigned int index_html_len = 13790; +unsigned int index_html_len = 13791; diff --git a/examples/server/public/index.html b/examples/server/public/index.html index ea93de4aafc48..1db69a6bab024 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -3,12 +3,11 @@ + llama.cpp - chat - +
@@ -310,7 +310,7 @@ Model Choice:
-
Claude Compatibility Rename Fix
+
Claude Compatibility Rename Fix
From 468ea24fb4633a0d681f7ac84089566c1c6190cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 2 Aug 2023 18:04:04 +0200 Subject: [PATCH 43/70] CUDA: faster non k-quant mul_mat_q kernels (#2483) --- ggml-cuda.cu | 580 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 368 insertions(+), 212 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a4dd6bb9df997..e0192bc6ecebc 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1362,22 +1362,185 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __ } // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called +// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q -#define VDR_q4_0_q8_1 1 +#define VDR_Q4_0_Q8_1_MMVQ 2 +#define VDR_Q4_0_Q8_1_MMQ 4 -static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( - const int & vi, const int & ui0, const int & ui1, const half & d4, const half2 & ds8) { +template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( + const int * v, const int * u, const float & d4, const half2 & ds8) { #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - // subtract 8 from each quantized value - const int vi0 = (vi >> 0) & 0x0F0F0F0F; - const int vi1 = (vi >> 4) & 0x0F0F0F0F; + int sumi = 0; - // SIMD dot product of quantized values - int sumi = __dp4a(vi0, ui0, 0); - sumi = __dp4a(vi1, ui1, sumi); +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + + // second part effectively subtracts 8 from each quant value + return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y)); +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_1_Q8_1_MMVQ 2 +#define VDR_Q4_1_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( + const int * v, const int * u, const half2 & dm4, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + +#ifdef GGML_CUDA_F16 + const half2 tmp = __hmul2(dm4, ds8); + const float d4d8 = __half2float(tmp.x); + const float m4s8 = __half2float(tmp.y); +#else + const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x); + const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y); +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it + return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_0_Q8_1_MMVQ 2 +#define VDR_Q5_0_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( + const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + } + + // second part effectively subtracts 16 from each quant value + return d5 * (sumi*__half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y)); +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_1_Q8_1_MMVQ 2 +#define VDR_Q5_1_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( + const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + } + +#ifdef GGML_CUDA_F16 + const half2 tmp = __hmul2(dm5, ds8); + const float d5d8 = __half2float(tmp.x); + const float m5s8 = __half2float(tmp.y); +#else + const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x); + const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y); +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it + return sumi*d5d8 + m5s8 / (QI5_1 / vdr); - return __half2float(d4) * (sumi * __half2float(ds8.x) - (8/QI4_0) * __half2float(ds8.y)); +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q8_0_Q8_1_MMVQ 2 +#define VDR_Q8_0_Q8_1_MMQ 8 + +template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( + const int * v, const int * u, const float & d8_0, const half2 & ds8_1) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + + return sumi * d8_0 * __half2float(ds8_1.x); +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl( + const int * v, const int * u, const half2 & dm8, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + +#ifdef GGML_CUDA_F16 + const half2 tmp = __hmul2(dm8, ds8); + const float d8d8 = __half2float(tmp.x); + const float m8s8 = __half2float(tmp.y); +#else + const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x); + const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y); +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it + return sumi*d8d8 + m8s8 / (QI8_1 / vdr); #else return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -1388,20 +1551,26 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; - const int vi = get_int_from_uint8(bq4_0->qs, iqs); - const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs); - const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI4_0); + int v[VDR_Q4_0_Q8_1_MMVQ]; + int u[2*VDR_Q4_0_Q8_1_MMVQ]; - return vec_dot_q4_0_q8_1_impl(vi, ui0, ui1, bq4_0->d, bq8_1->ds); +#pragma unroll + for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); + } + + return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); } static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0]; + __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0]; *x_ql = tile_x_qs; - *x_dm = tile_x_d; + *x_dm = (half2 *) tile_x_d; } template static __device__ __forceinline__ void load_tiles_q4_0( @@ -1418,6 +1587,8 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_0 * bx0 = (block_q4_0 *) vx; + float * x_dmf = (float *) x_dm; + #pragma unroll for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { int i = i0 + i_offset; @@ -1429,7 +1600,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx].x = bxi->d; + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; } // const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; @@ -1462,39 +1633,19 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( __builtin_assume(k < WARP_SIZE); const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const float * x_dmf = (float *) x_dm; - return vec_dot_q4_0_q8_1_impl( - x_ql[i * (WARP_SIZE + 1) + k], y_qs[j * (2*WARP_SIZE) + kyqs], y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], - x_dm[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0].x, y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); -} - -#define VDR_q4_1_q8_1 1 - -static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( - const int & vi, const int & ui0, const int & ui1, const half2 & dm4, const half2 & ds8) { + int u[2*VDR_Q4_0_Q8_1_MMQ]; -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const int vi0 = (vi >> 0) & 0x0F0F0F0F; - const int vi1 = (vi >> 4) & 0x0F0F0F0F; - - // SIMD dot product of quantized values - int sumi = __dp4a(vi0, ui0, 0); - sumi = __dp4a(vi1, ui1, sumi); - -#ifdef GGML_CUDA_F16 - const half2 tmp = __hmul2(dm4, ds8); - const float d4d8 = __half2float(tmp.x); - const float m4s8 = __half2float(tmp.y); -#else - const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x); - const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y); -#endif // GGML_CUDA_F16 +#pragma unroll + for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; + u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0]; + } - // scale second part of sum by QI8_1/QR4_1 to compensate for multiple threads adding it - return sumi * d4d8 + m4s8 / (QI8_1 / QR4_1); -#else - return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + return vec_dot_q4_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], + y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); } static __device__ __forceinline__ float vec_dot_q4_1_q8_1( @@ -1502,11 +1653,17 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; - const int vi = get_int_from_uint8_aligned(bq4_1->qs, iqs); - const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs); - const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI4_1); + int v[VDR_Q4_1_Q8_1_MMVQ]; + int u[2*VDR_Q4_1_Q8_1_MMVQ]; - return vec_dot_q4_1_q8_1_impl(vi, ui0, ui1, bq4_1->dm, bq8_1->ds); +#pragma unroll + for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); + } + + return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); } static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { @@ -1575,35 +1732,17 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - return vec_dot_q4_1_q8_1_impl( - x_ql[i * (WARP_SIZE + 1) + k], y_qs[j * (2*WARP_SIZE) + kyqs], y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], - x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); -} - -#define VDR_q5_0_q8_1 1 + int u[2*VDR_Q4_1_Q8_1_MMQ]; -static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( - const int & qs, const int & qh, const int & ui0, const int & ui1, const half & d5, const half2 & ds8) { +#pragma unroll + for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; + u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1]; + } -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits - vi0 |= (qh << 4) & 0x00000010; // 0 -> 4 - vi0 |= (qh << 11) & 0x00001000; // 1 -> 12 - vi0 |= (qh << 18) & 0x00100000; // 2 -> 20 - vi0 |= (qh << 25) & 0x10000000; // 3 -> 28 - int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values - - int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits - vi1 |= (qh >> 12) & 0x00000010; // 16 -> 4 - vi1 |= (qh >> 5) & 0x00001000; // 17 -> 12 - vi1 |= (qh << 2) & 0x00100000; // 18 -> 20 - vi1 |= (qh << 9) & 0x10000000; // 19 -> 28 - sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values - - return __half2float(d5) * (sumi*__half2float(ds8.x) - (16/QI5_0) * __half2float(ds8.y)); -#else - return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + return vec_dot_q4_1_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], + y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); } static __device__ __forceinline__ float vec_dot_q5_0_q8_1( @@ -1611,23 +1750,28 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; - const int qs = get_int_from_uint8(bq5_0->qs, iqs); - const int qh = get_int_from_uint8(bq5_0->qh, 0) >> (4 * iqs); - const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs); - const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI5_0); + int vl[VDR_Q5_0_Q8_1_MMVQ]; + int vh[VDR_Q5_0_Q8_1_MMVQ]; + int u[2*VDR_Q5_0_Q8_1_MMVQ]; - return vec_dot_q5_0_q8_1_impl(qs, qh, ui0, ui1, bq5_0->d, bq8_1->ds); +#pragma unroll + for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); + vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); + } + + return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); } static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0]; - __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0]; + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; + __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0]; *x_ql = tile_x_ql; - *x_qh = tile_x_qh; - *x_dm = tile_x_d; + *x_dm = (half2 *) tile_x_d; } template static __device__ __forceinline__ void load_tiles_q5_0( @@ -1654,11 +1798,31 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + const int ql = get_int_from_uint8(bxi->qs, kqsx); + const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + qs0 = __vsubss4(qs0, 0x10101010); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + qs1 = __vsubss4(qs1, 0x10101010); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; } const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; #pragma unroll for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) { @@ -1670,8 +1834,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; - x_qh[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = get_int_from_uint8(bxi->qh, 0); - x_dm[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd].x = bxi->d; + x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d; } } @@ -1688,46 +1851,18 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; + const float * x_dmf = (float *) x_dm; - return vec_dot_q5_0_q8_1_impl( - x_ql[i * (WARP_SIZE + 1) + k], x_qh[index_bx] >> (4 * (k % QI5_0)), y_qs[j * (2*WARP_SIZE) + kyqs], - y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], x_dm[index_bx].x, y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); -} - -#define VDR_q5_1_q8_1 1 - -static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( - const int & qs, const int & qh, const int & ui0, const int & ui1, const half2 & dm5, const half2 & ds8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits - vi0 |= (qh << 4) & 0x00000010; // 0 -> 4 - vi0 |= (qh << 11) & 0x00001000; // 1 -> 12 - vi0 |= (qh << 18) & 0x00100000; // 2 -> 20 - vi0 |= (qh << 25) & 0x10000000; // 3 -> 28 - int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values - - int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits - vi1 |= (qh >> 12) & 0x00000010; // 16 -> 4 - vi1 |= (qh >> 5) & 0x00001000; // 17 -> 12 - vi1 |= (qh << 2) & 0x00100000; // 18 -> 20 - vi1 |= (qh << 9) & 0x10000000; // 19 -> 28 - sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values - -#ifdef GGML_CUDA_F16 - const half2 tmp = __hmul2(dm5, ds8); - const float d5d8 = __half2float(tmp.x); - const float m5s8 = __half2float(tmp.y); -#else - const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x); - const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y); -#endif // GGML_CUDA_F16 + int u[2*VDR_Q5_0_Q8_1_MMQ]; - return sumi*d5d8 + m5s8/QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block +#pragma unroll + for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; + u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0]; + } -#else - return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); } static __device__ __forceinline__ float vec_dot_q5_1_q8_1( @@ -1735,22 +1870,27 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; - const int qs = get_int_from_uint8_aligned(bq5_1->qs, iqs); - const int qh = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * iqs); - const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs); - const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI5_1); + int vl[VDR_Q5_1_Q8_1_MMVQ]; + int vh[VDR_Q5_1_Q8_1_MMVQ]; + int u[2*VDR_Q5_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); + vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); + } - return vec_dot_q5_1_q8_1_impl(qs, qh, ui0, ui1, bq5_1->dm, bq8_1->ds); + return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); } static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE ) + GGML_CUDA_MMQ_Y]; - __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1]; + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1]; *x_ql = tile_x_ql; - *x_qh = tile_x_qh; *x_dm = tile_x_dm; } @@ -1778,7 +1918,24 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; } const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; @@ -1794,7 +1951,6 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; - x_qh[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = get_int_from_uint8_aligned(bxi->qh, 0); x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; } } @@ -1813,24 +1969,16 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; - return vec_dot_q5_1_q8_1_impl( - x_ql[i * (WARP_SIZE + 1) + k], x_qh[index_bx] >> (4 * (k % QI5_1)), y_qs[j * (2*WARP_SIZE) + kyqs], - y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); -} - -#define VDR_q8_0_q8_1 1 + int u[2*VDR_Q5_1_Q8_1_MMQ]; -static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( - const int & vi, const int & ui, const half & d8_0, const half2 & ds8_1) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - // SIMD dot product of quantized values - const int sumi = __dp4a(vi, ui, 0); +#pragma unroll + for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; + u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1]; + } - return sumi * __half2float(d8_0) * __half2float(ds8_1.x); -#else - return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + return vec_dot_q8_1_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); } static __device__ __forceinline__ float vec_dot_q8_0_q8_1( @@ -1838,19 +1986,24 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; - const int vi = get_int_from_int8(bq8_0->qs, iqs); - const int ui = get_int_from_int8_aligned(bq8_1->qs, iqs); + int v[VDR_Q8_0_Q8_1_MMVQ]; + int u[VDR_Q8_0_Q8_1_MMVQ]; - return vec_dot_q8_0_q8_1_impl(vi, ui, bq8_0->d, bq8_1->ds); + for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_int8(bq8_0->qs, iqs + i); + u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + } + + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, bq8_1->ds); } static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0]; + __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0]; *x_ql = tile_x_qs; - *x_dm = tile_x_d; + *x_dm = (half2 *) tile_x_d; } template static __device__ __forceinline__ void load_tiles_q8_0( @@ -1864,6 +2017,7 @@ template static __device__ __forceinline__ void load_tiles_q8_ const int kbx = k / QI8_0; const int kqsx = k % QI8_0; + float * x_dmf = (float *) x_dm; const block_q8_0 * bx0 = (block_q8_0 *) vx; @@ -1878,7 +2032,7 @@ template static __device__ __forceinline__ void load_tiles_q8_ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); - x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx].x = bxi->d; + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d; } // const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; @@ -1912,9 +2066,11 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); - return vec_dot_q8_0_q8_1_impl( - x_ql[i * (WARP_SIZE + 1) + k], y_qs[j*WARP_SIZE + k], - x_dm[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0].x, y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]); + const float * x_dmf = (float *) x_dm; + + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], + y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]); } #define VDR_q2_K_q8_1 1 @@ -2288,15 +2444,15 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( int u[2*QR4_K]; float d8[QR4_K]; - // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6 - const int bq8_offset = QR4_K * (iqs / (QI8_1/2)); + // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6 + const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2)); // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 - const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4)); + const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); v[0] = q4[0]; v[1] = q4[4]; @@ -2317,7 +2473,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; d8[i] = bq8i->ds.x; - const int * q8 = (const int *)bq8i->qs + (iqs%4); + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; u[2*i+1] = q8[4]; } @@ -2345,12 +2501,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( const float d8_1 = bq8_1[0].ds.x; const float d8_2 = bq8_1[1].ds.x; - const int ui1 = *((const int *)bq8_1[0].qs + iqs); - const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4); - const int ui3 = *((const int *)bq8_1[1].qs + iqs); - const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4); + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); - const int * q4 = (const int *)bq4_K->qs + iqs; + const int * q4 = (const int *)bq4_K->qs + (iqs/2); const int v1 = q4[0]; const int v2 = q4[4]; @@ -2457,11 +2613,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( int u[2*QR4_K]; float d8[QR4_K]; - // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6 - const int bq8_offset = QR4_K * (kqsx / (QI8_1/2)); + // kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6 + const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2)); - v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 0]; - v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 4]; + v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0]; + v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4]; const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4]; uint16_t aux[2]; @@ -2477,7 +2633,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const uint8_t * m = sc + 2; for (int l = 0; l < QR4_K; ++l) { - const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + kqsx % (QI8_1/2); + const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2); u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)]; u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)]; d8[l] = y_ds[kqsy / QI8_1].x; @@ -2532,9 +2688,9 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( int u[2*QR5_K]; float d8[QR5_K]; - const int bq8_offset = QR5_K * (iqs / (QI8_1/2)); - const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4)); - const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4)); + const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2)); + const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4)); vl[0] = ql[0]; vl[1] = ql[4]; @@ -2559,7 +2715,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; d8[i] = bq8i->ds.x; - const int * q8 = (const int *)bq8i->qs + (iqs%4); + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; u[2*i+1] = q8[4]; } @@ -2578,17 +2734,17 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( const float d8_1 = bq8_1[0].ds.x; const float d8_2 = bq8_1[1].ds.x; - const int ui1 = *((const int *)bq8_1[0].qs + iqs); - const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4); - const int ui3 = *((const int *)bq8_1[1].qs + iqs); - const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4); + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); - const int * ql = (const int *)bq5_K->qs + iqs; + const int * ql = (const int *)bq5_K->qs + (iqs/2); const int vl1 = ql[0]; const int vl2 = ql[4]; - const int step = 4 * iqs; // 0, 4, 8, 12 - const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3 + const int step = 4 * (iqs/2); // 0, 4, 8, 12 + const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 const int in = step%8; // 0, 4, 0, 4 const int vh = (*((const int *)(bq5_K->qh + in))) >> im; @@ -2711,13 +2867,13 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( int u[2*QR4_K]; float d8[QR4_K]; - const int bq8_offset = QR5_K * (kqsx / (QI8_1/2)); + const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2)); - vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 0]; - vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 4]; + vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0]; + vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4]; - vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + kqsx % 4 + 0] >> bq8_offset; - vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + kqsx % 4 + 4] >> bq8_offset; + vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset; + vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset; const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4]; uint16_t aux[2]; @@ -2733,7 +2889,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( const uint8_t * m = sc + 2; for (int l = 0; l < QR5_K; ++l) { - const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + kqsx % (QI8_1/2); + const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2); u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)]; u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)]; d8[l] = y_ds[kqsy / QI8_1].x; @@ -2982,7 +3138,7 @@ static __global__ void mul_mat_q( #if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal #pragma unroll #endif // __CUDA_ARCH__ >= 700 - for (int k = 0; k < WARP_SIZE/vdr; ++k) { + for (int k = 0; k < WARP_SIZE; k += vdr) { #pragma unroll for (int j = 0; j < WARP_SIZE; j += 8) { #pragma unroll @@ -3034,9 +3190,9 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * for (int i = 0; i < blocks_per_row; i += blocks_per_warp) { const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index - const int iby = (i + threadIdx.x / (qi/vdr)) * qk/QK8_1; // y block index that aligns with ibx + const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx - const int iqs = threadIdx.x % (qi/vdr); // x block quant index when casting the quants to int + const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs); } @@ -3579,7 +3735,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -3588,7 +3744,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -3597,7 +3753,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -3606,7 +3762,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -3615,7 +3771,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -3717,10 +3873,10 @@ static void ggml_mul_mat_q4_0_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_q4_0_q8_1, vec_dot_q4_0_q8_1_mul_mat> + mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_q4_0_q8_1, vec_dot_q4_0_q8_1_mul_mat> + mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -3735,10 +3891,10 @@ static void ggml_mul_mat_q4_1_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_q4_1_q8_1, vec_dot_q4_1_q8_1_mul_mat> + mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_q4_1_q8_1, vec_dot_q4_1_q8_1_mul_mat> + mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -3753,10 +3909,10 @@ static void ggml_mul_mat_q5_0_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_q5_0_q8_1, vec_dot_q5_0_q8_1_mul_mat> + mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_q5_0_q8_1, vec_dot_q5_0_q8_1_mul_mat> + mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -3771,10 +3927,10 @@ static void ggml_mul_mat_q5_1_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_q5_1_q8_1, vec_dot_q5_1_q8_1_mul_mat> + mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_q5_1_q8_1, vec_dot_q5_1_q8_1_mul_mat> + mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -3789,10 +3945,10 @@ static void ggml_mul_mat_q8_0_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_q8_0_q8_1, vec_dot_q8_0_q8_1_mul_mat> + mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_q8_0_q8_1, vec_dot_q8_0_q8_1_mul_mat> + mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } From 9281c2801fd0386c037a4499db30bc9ac8a52296 Mon Sep 17 00:00:00 2001 From: duncannah <38371914+duncannah@users.noreply.github.com> Date: Wed, 2 Aug 2023 23:42:43 +0200 Subject: [PATCH 44/70] fix: don't send headers twice when streaming --- koboldcpp.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 1ba082d92d7ce..04e171acea819 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -572,12 +572,14 @@ def do_POST(self): newprompt = fullprompt gen = asyncio.run(self.handle_request(genparams, newprompt, basic_api_flag, kai_sse_stream_flag)) - try: - self.send_response(200) - self.end_headers() - self.wfile.write(json.dumps(gen).encode()) - except: - print("Generate: The response could not be sent, maybe connection was terminated?") + + if not kai_sse_stream_flag: + try: + self.send_response(200) + self.end_headers() + self.wfile.write(json.dumps(gen).encode()) + except: + print("Generate: The response could not be sent, maybe connection was terminated?") return finally: From 8183159cf3def112f6d1fe94815fce70e1bffa12 Mon Sep 17 00:00:00 2001 From: Evan Jones Date: Wed, 2 Aug 2023 22:05:44 -0400 Subject: [PATCH 45/70] examples : generate JSON according to schema (#1887) * examples : add JSON schema grammars * complete JSON grammar * ensure primitive types can be used as root of schema * support integer type and adjust usage text --- examples/json-schema-to-grammar.py | 132 +++++++++++++++++++++++++++++ grammars/json.gbnf | 16 ++-- 2 files changed, 138 insertions(+), 10 deletions(-) create mode 100644 examples/json-schema-to-grammar.py diff --git a/examples/json-schema-to-grammar.py b/examples/json-schema-to-grammar.py new file mode 100644 index 0000000000000..2dccc118a70e8 --- /dev/null +++ b/examples/json-schema-to-grammar.py @@ -0,0 +1,132 @@ +import argparse +import json +import re +import sys + +# whitespace is constrained to a single space char to prevent model "running away" in +# whitespace. Also maybe improves generation quality? +SPACE_RULE = '" "?' + +PRIMITIVE_RULES = { + 'boolean': '("true" | "false") space', + 'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space', + 'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space', + 'string': r''' "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\"" space ''', + 'null': '"null" space', +} + +INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+') +GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]') +GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'} + + +class SchemaConverter: + def __init__(self, prop_order): + self._prop_order = prop_order + self._rules = {'space': SPACE_RULE} + + def _format_literal(self, literal): + escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub( + lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal) + ) + return f'"{escaped}"' + + def _add_rule(self, name, rule): + esc_name = INVALID_RULE_CHARS_RE.sub('-', name) + if esc_name not in self._rules or self._rules[esc_name] == rule: + key = esc_name + else: + i = 0 + while f'{esc_name}{i}' in self._rules: + i += 1 + key = f'{esc_name}{i}' + self._rules[key] = rule + return key + + def visit(self, schema, name): + schema_type = schema.get('type') + rule_name = name or 'root' + + if 'oneOf' in schema or 'anyOf' in schema: + rule = ' | '.join(( + self.visit(alt_schema, f'{name}{"-" if name else ""}{i}') + for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf']) + )) + return self._add_rule(rule_name, rule) + + elif 'const' in schema: + return self._add_rule(rule_name, self._format_literal(schema['const'])) + + elif 'enum' in schema: + rule = ' | '.join((self._format_literal(v) for v in schema['enum'])) + return self._add_rule(rule_name, rule) + + elif schema_type == 'object' and 'properties' in schema: + # TODO: `required` keyword + prop_order = self._prop_order + prop_pairs = sorted( + schema['properties'].items(), + # sort by position in prop_order (if specified) then by key + key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]), + ) + + rule = '"{" space' + for i, (prop_name, prop_schema) in enumerate(prop_pairs): + prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}') + if i > 0: + rule += ' "," space' + rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}' + rule += ' "}" space' + + return self._add_rule(rule_name, rule) + + elif schema_type == 'array' and 'items' in schema: + # TODO `prefixItems` keyword + item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item') + rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space' + return self._add_rule(rule_name, rule) + + else: + assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}' + return self._add_rule( + 'root' if rule_name == 'root' else schema_type, + PRIMITIVE_RULES[schema_type] + ) + + def format_grammar(self): + return '\n'.join((f'{name} ::= {rule}' for name, rule in self._rules.items())) + + +def main(args_in = None): + parser = argparse.ArgumentParser( + description=''' + Generates a grammar (suitable for use in ./main) that produces JSON conforming to a + given JSON schema. Only a subset of JSON schema features are supported; more may be + added in the future. + ''', + ) + parser.add_argument( + '--prop-order', + default=[], + type=lambda s: s.split(','), + help=''' + comma-separated property names defining the order of precedence for object properties; + properties not specified here are given lower precedence than those that are, and are + sorted alphabetically + ''' + ) + parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)') + args = parser.parse_args(args_in) + + schema = json.load(sys.stdin if args.schema == '-' else open(args.schema)) + prop_order = {name: idx for idx, name in enumerate(args.prop_order)} + converter = SchemaConverter(prop_order) + converter.visit(schema, '') + print(converter.format_grammar()) + + +if __name__ == '__main__': + main() diff --git a/grammars/json.gbnf b/grammars/json.gbnf index 40fa2b6373255..a9537cdf9fbe4 100644 --- a/grammars/json.gbnf +++ b/grammars/json.gbnf @@ -1,29 +1,25 @@ -# Grammar for subset of JSON - doesn't support full string or number syntax - -root ::= object -value ::= object | array | string | number | boolean | "null" +root ::= object +value ::= object | array | string | number | ("true" | "false" | "null") ws object ::= "{" ws ( string ":" ws value ("," ws string ":" ws value)* - )? "}" + )? "}" ws array ::= "[" ws ( value ("," ws value)* - )? "]" + )? "]" ws -string ::= +string ::= "\"" ( [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes )* "\"" ws -# Only plain integers currently -number ::= "-"? [0-9]+ ws -boolean ::= ("true" | "false") ws +number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws # Optional space: by convention, applied in this grammar after literal chars when allowed ws ::= ([ \t\n] ws)? From 3fa6befdaf5a97220ac9a59fe2f8217f0222420d Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Thu, 3 Aug 2023 10:50:16 +0800 Subject: [PATCH 46/70] increase max free blocks --- ggml-alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 5e1be61ff6cef..129af8a97ca6a 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -56,7 +56,7 @@ struct free_block { size_t size; }; -#define MAX_FREE_BLOCKS 128 +#define MAX_FREE_BLOCKS 256 struct ggml_allocr { void * data; From ba2040d1df9ca5fbcea3b4d8ad12a637b5ed1f99 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Thu, 3 Aug 2023 12:52:06 +0800 Subject: [PATCH 47/70] compile fix for ARM NEON --- koboldcpp.py | 6 ++-- otherarch/ggml_v1.c | 86 --------------------------------------------- 2 files changed, 3 insertions(+), 89 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 1ba082d92d7ce..131ae2122220c 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -704,7 +704,7 @@ def show_new_gui(): (lib_default, "Use No BLAS"), (lib_noavx2, "NoAVX2 Mode (Old CPU)"), (lib_failsafe, "Failsafe Mode (Old CPU)")] - openblas_option, clblast_option, cublas_option, default_option, openblas_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs) + openblas_option, clblast_option, cublas_option, default_option, noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs) # slider data blasbatchsize_values = ["-1", "32", "64", "128", "256", "512", "1024"] blasbatchsize_text = ["Don't Batch BLAS","32","64","128","256","512","1024"] @@ -1151,8 +1151,8 @@ def import_vars(dict): if failsafe_option is not None: runopts_var.set(failsafe_option) elif "noavx2" in dict and dict["noavx2"]: - if openblas_noavx2_option is not None: - runopts_var.set(openblas_noavx2_option) + if noavx2_option is not None: + runopts_var.set(noavx2_option) elif "noblas" in dict and dict["noblas"]: if default_option is not None: runopts_var.set(default_option) diff --git a/otherarch/ggml_v1.c b/otherarch/ggml_v1.c index 04e981a2625d1..bfa0fe7099d15 100644 --- a/otherarch/ggml_v1.c +++ b/otherarch/ggml_v1.c @@ -1577,63 +1577,6 @@ inline static void ggml_v1_vec_mad_q4_0(const int n, float * restrict y, void * const float * restrict pd = (const float *) (x); const uint8_t * restrict pb = (const uint8_t *) (pd + nb); -#if __ARM_NEON -#if QK == 32 - for (int i = 0; i < nb; ++i) { - const float d0 = pd[i]*v; - - const uint8_t * restrict pp = pb + i*16; - - const uint8x8_t m4b = vdup_n_u8(0xf); - const int8x8_t s8b = vdup_n_s8(0x8); - - const float32x4_t vd = vdupq_n_f32(d0); - - for (int j = 0; j < 2; j++) { - const uint8x8_t vx = vld1_u8(pp + j*8); - - const int8x8_t vxl = vreinterpret_s8_u8(vand_u8(vx, m4b)); - const int8x8_t vxh = vreinterpret_s8_u8(vshr_n_u8(vx, 4)); - - // sub 8 - const int8x8_t vxls = vsub_s8(vxl, s8b); - const int8x8_t vxhs = vsub_s8(vxh, s8b); - - //const int8x8_t vxlt = vzip_s8(vxls, vxhs)[0]; - //const int8x8_t vxht = vzip_s8(vxls, vxhs)[1]; - const int8x8_t vxlt = vzip1_s8(vxls, vxhs); - const int8x8_t vxht = vzip2_s8(vxls, vxhs); - - const int8x16_t vxq = vcombine_s8(vxlt, vxht); - - // convert to 2x int16x8_t - const int16x8_t vxq0 = vmovl_s8(vget_low_s8 (vxq)); - const int16x8_t vxq1 = vmovl_s8(vget_high_s8(vxq)); - - // convert to 4x float32x4_t - const float32x4_t vx0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq0))); - const float32x4_t vx1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq0))); - const float32x4_t vx2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq1))); - const float32x4_t vx3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq1))); - - const float32x4_t vy0 = vld1q_f32(y + i*32 + j*16 + 0); - const float32x4_t vy1 = vld1q_f32(y + i*32 + j*16 + 4); - const float32x4_t vy2 = vld1q_f32(y + i*32 + j*16 + 8); - const float32x4_t vy3 = vld1q_f32(y + i*32 + j*16 + 12); - - const float32x4_t vr0 = vfmaq_f32(vy0, vx0, vd); - const float32x4_t vr1 = vfmaq_f32(vy1, vx1, vd); - const float32x4_t vr2 = vfmaq_f32(vy2, vx2, vd); - const float32x4_t vr3 = vfmaq_f32(vy3, vx3, vd); - - vst1q_f32(y + i*32 + j*16 + 0, vr0); - vst1q_f32(y + i*32 + j*16 + 4, vr1); - vst1q_f32(y + i*32 + j*16 + 8, vr2); - vst1q_f32(y + i*32 + j*16 + 12, vr3); - } - } -#endif -#else // scalar for (int i = 0; i < nb; i++) { const float d = pd[i]; @@ -1658,7 +1601,6 @@ inline static void ggml_v1_vec_mad_q4_0(const int n, float * restrict y, void * assert(!isinf(y[i*QK + l + 1])); } } -#endif } inline static void ggml_v1_vec_mad_q4_1(const int n, float * restrict y, void * restrict x, const float v) { @@ -10199,31 +10141,3 @@ int ggml_v1_cpu_has_vsx(void) { } //////////////////////////////////////////////////////////////////////////////// - - -#if __ARM_NEON -#if !defined(__aarch64__) -int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) { - int8x8_t res; - - res[0] = a[0]; res[1] = b[0]; - res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; - res[6] = a[3]; res[7] = b[3]; - - return res; -} - -int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) { - int8x8_t res; - - res[0] = a[4]; res[1] = b[4]; - res[2] = a[5]; res[3] = b[5]; - res[4] = a[6]; res[5] = b[6]; - res[6] = a[7]; res[7] = b[7]; - - return res; -} - -#endif -#endif \ No newline at end of file From 63ec711a70215c788448c6af9f66c09240577d0b Mon Sep 17 00:00:00 2001 From: duncannah <38371914+duncannah@users.noreply.github.com> Date: Thu, 3 Aug 2023 14:35:43 +0200 Subject: [PATCH 48/70] fix: still send full result after streaming --- koboldcpp.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 04e171acea819..f9526531e5577 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -573,11 +573,13 @@ def do_POST(self): gen = asyncio.run(self.handle_request(genparams, newprompt, basic_api_flag, kai_sse_stream_flag)) - if not kai_sse_stream_flag: - try: + try: + # Headers are already sent when streaming + if not kai_sse_stream_flag: self.send_response(200) self.end_headers() - self.wfile.write(json.dumps(gen).encode()) + + self.wfile.write(json.dumps(gen).encode()) except: print("Generate: The response could not be sent, maybe connection was terminated?") From f0764c6cfb1efa6f4f5d7b0f57b3a9a86d4ab798 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 4 Aug 2023 10:29:56 +0800 Subject: [PATCH 49/70] fix indentation, increase server thread count --- koboldcpp.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 4918ab6ecd4cf..59b6d86e390bb 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -578,10 +578,9 @@ def do_POST(self): if not kai_sse_stream_flag: self.send_response(200) self.end_headers() - self.wfile.write(json.dumps(gen).encode()) - except: - print("Generate: The response could not be sent, maybe connection was terminated?") + except: + print("Generate: The response could not be sent, maybe connection was terminated?") return finally: @@ -647,7 +646,7 @@ def stop(self): exitcounter = 999 self.httpd.server_close() - numThreads = 6 + numThreads = 8 threadArr = [] for i in range(numThreads): threadArr.append(Thread(i)) From ff966e7ca6af127c9405523cdb07ef8fa01bf6d6 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Fri, 4 Aug 2023 13:07:21 +0300 Subject: [PATCH 50/70] build : fix several cast and printf warnings (#2499) --- examples/embd-input/embd-input-lib.cpp | 2 +- examples/grammar-parser.cpp | 2 +- examples/perplexity/perplexity.cpp | 8 ++++---- examples/simple/simple.cpp | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 26563821a1078..2185b9b0e2839 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -30,7 +30,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) { fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); + params.seed = uint32_t(time(NULL)); } fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); diff --git a/examples/grammar-parser.cpp b/examples/grammar-parser.cpp index 019d5e1bf9ceb..e76bd11c31598 100644 --- a/examples/grammar-parser.cpp +++ b/examples/grammar-parser.cpp @@ -405,7 +405,7 @@ namespace grammar_parser { for (size_t i = 0, end = state.rules.size(); i < end; i++) { // fprintf(file, "%zu: ", i); // print_rule_binary(file, state.rules[i]); - print_rule(file, i, state.rules[i], symbol_id_names); + print_rule(file, uint32_t(i), state.rules[i], symbol_id_names); // fprintf(file, "\n"); } } catch (const std::exception & err) { diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 6870a11b931db..62433e983df9c 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -153,7 +153,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { } size_t hs_task_count = prompt_lines.size()/6; - fprintf(stderr, "%s : loaded %lu tasks from prompt.\n", __func__, hs_task_count); + fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); // This is needed as usual for LLaMA models bool prepend_bos = true; @@ -178,7 +178,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { double ending_logprob[4]; }; - fprintf(stderr, "%s : selecting %lu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") ); + fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") ); // Select and read data from prompt lines hs_data_t *hs_data = new hs_data_t[hs_task_count]; @@ -223,7 +223,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { // Stop if query wont fit the ctx window if (query_size > (size_t)params.n_ctx) { - fprintf(stderr, "%s : number of tokens in query %lu > n_ctxl\n", __func__, query_size); + fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size); return; } @@ -284,7 +284,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { } // Print the accumulated accuracy mean x 100 - printf("%li\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0); + printf("%zu\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0); fflush(stdout); } diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index aa2c4352df294..97137a6584aa3 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -123,7 +123,7 @@ int main(int argc, char ** argv) // Evaluate the tokens : //--------------------------------- - if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) ) + if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) ) { fprintf( stderr, "%s : failed to eval\n" , __func__ ); return 1; From 415e99fec27be5a2e4283f1937afd17eb33fbd66 Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Fri, 4 Aug 2023 19:29:52 +0800 Subject: [PATCH 51/70] Stream save llama context data to file instead of allocating entire buffer upfront (#2488) * added stream saving context data to file to avoid allocating unnecessary amounts of memory * generalised copying state data to file or buffer * added comments explaining how copy_state_data works * fixed trailing whitespaces * fixed save load state example * updated save load state to use public function in llama.cpp * - restored breakage of the llama_copy_state_data API - moved new logic for copying llama state data to internal function * fixed function declaration order * restored save load state example * fixed whitepace * removed unused llama-util.h include * Apply suggestions from code review Co-authored-by: slaren * Apply code review suggestions Co-authored-by: slaren --------- Co-authored-by: slaren --- llama-util.h | 40 ++++++++++++++++++++++++++ llama.cpp | 79 ++++++++++++++++++++++++++++++---------------------- 2 files changed, 86 insertions(+), 33 deletions(-) diff --git a/llama-util.h b/llama-util.h index 042ebe43c48e1..3fc03ce28273e 100644 --- a/llama-util.h +++ b/llama-util.h @@ -149,6 +149,46 @@ struct llama_file { } }; +// llama_context_data +struct llama_data_context { + virtual void write(const void * src, size_t size) = 0; + virtual size_t get_size_written() = 0; + virtual ~llama_data_context() = default; +}; + +struct llama_data_buffer_context : llama_data_context { + uint8_t* ptr; + size_t size_written = 0; + + llama_data_buffer_context(uint8_t * p) : ptr(p) {} + + void write(const void * src, size_t size) override { + memcpy(ptr, src, size); + ptr += size; + size_written += size; + } + + size_t get_size_written() override { + return size_written; + } +}; + +struct llama_data_file_context : llama_data_context { + llama_file* file; + size_t size_written = 0; + + llama_data_file_context(llama_file * f) : file(f) {} + + void write(const void * src, size_t size) override { + file->write_raw(src, size); + size_written += size; + } + + size_t get_size_written() override { + return size_written; + } +}; + #if defined(_WIN32) static std::string llama_format_win_err(DWORD err) { LPSTR buf; diff --git a/llama.cpp b/llama.cpp index d427054dd8d6a..839739870eb3e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3743,10 +3743,20 @@ size_t llama_get_state_size(const struct llama_context * ctx) { return s_total; } -// Copies the state to the specified destination address -size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { - uint8_t * out = dst; - +/** copy state data into either a buffer or file depending on the passed in context + * + * file context: + * llama_file file("/path", "wb"); + * llama_data_file_context data_ctx(&file); + * llama_copy_state_data(ctx, &data_ctx); + * + * buffer context: + * std::vector buf(max_size, 0); + * llama_data_buffer_context data_ctx(&buf.data()); + * llama_copy_state_data(ctx, &data_ctx); + * +*/ +void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { // copy rng { std::stringstream rng_ss; @@ -3758,8 +3768,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE); memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size()); - memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size); - memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE; + data_ctx->write(&rng_size, sizeof(rng_size)); + data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE); } // copy logits @@ -3767,25 +3777,29 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { const size_t logits_cap = ctx->logits.capacity(); const size_t logits_size = ctx->logits.size(); - memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap); - memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size); + data_ctx->write(&logits_cap, sizeof(logits_cap)); + data_ctx->write(&logits_size, sizeof(logits_size)); if (logits_size) { - memcpy(out, ctx->logits.data(), logits_size * sizeof(float)); + data_ctx->write(ctx->logits.data(), logits_size * sizeof(float)); } - out += logits_cap * sizeof(float); + // If there is a gap between the size and the capacity, write padding + size_t padding_size = (logits_cap - logits_size) * sizeof(float); + if (padding_size > 0) { + std::vector padding(padding_size, 0); // Create a buffer filled with zeros + data_ctx->write(padding.data(), padding_size); + } } // copy embeddings { const size_t embedding_size = ctx->embedding.size(); - memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size); + data_ctx->write(&embedding_size, sizeof(embedding_size)); if (embedding_size) { - memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float)); - out += embedding_size * sizeof(float); + data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float)); } } @@ -3800,8 +3814,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { const size_t kv_size = kv_self.buf.size; const int kv_ntok = llama_get_kv_cache_token_count(ctx); - memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size); - memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok); + data_ctx->write(&kv_size, sizeof(kv_size)); + data_ctx->write(&kv_ntok, sizeof(kv_ntok)); if (kv_size) { const size_t elt_size = ggml_element_size(kv_self.k); @@ -3810,12 +3824,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_cgraph gf{}; ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); - kout3d->data = out; - out += ggml_nbytes(kout3d); + std::vector kout3d_data(ggml_nbytes(kout3d), 0); + kout3d->data = kout3d_data.data(); ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); - vout3d->data = out; - out += ggml_nbytes(vout3d); + std::vector vout3d_data(ggml_nbytes(vout3d), 0); + vout3d->data = vout3d_data.data(); ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, n_embd, kv_ntok, n_layer, @@ -3830,15 +3844,20 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); + + // our data is now in the kout3d_data and vout3d_data buffers + // write them to file + data_ctx->write(kout3d_data.data(), kout3d_data.size()); + data_ctx->write(vout3d_data.data(), vout3d_data.size()); } } +} - const size_t written = out - dst; - const size_t max_size = llama_get_state_size(ctx); - - LLAMA_ASSERT(written <= max_size); +size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { + llama_data_buffer_context data_ctx(dst); + llama_copy_state_data_internal(ctx, &data_ctx); - return written; + return data_ctx.get_size_written(); } // Sets the state reading from the specified source address @@ -4023,15 +4042,9 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi file.write_u32((uint32_t) n_token_count); file.write_raw(tokens, sizeof(llama_token) * n_token_count); - // save the context state - { - const size_t n_state_size_max = llama_get_state_size(ctx); - - std::vector state_data(n_state_size_max); - const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data()); - - file.write_raw(state_data.data(), n_state_size_cur); - } + // save the context state using stream saving + llama_data_file_context data_ctx(&file); + llama_copy_state_data_internal(ctx, &data_ctx); return true; } From 5f631c26794b6371fcf2660e8d0c53494a5575f7 Mon Sep 17 00:00:00 2001 From: Stephen Nichols Date: Fri, 4 Aug 2023 06:37:24 -0500 Subject: [PATCH 52/70] Fixing race condition in server and partial stream handling in frontend. (#2391) * Fixing race condition in server.cpp and partial stream handling in completion.js * Reverting assert edits. * Adding newline to eof --- examples/server/public/completion.js | 55 +++++++++++++++++++--------- examples/server/server.cpp | 6 ++- 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js index a43d5a7d56753..0c9bd5f1021db 100644 --- a/examples/server/public/completion.js +++ b/examples/server/public/completion.js @@ -43,6 +43,7 @@ export async function* llama(prompt, params = {}, config = {}) { const decoder = new TextDecoder(); let content = ""; + let leftover = ""; // Buffer for partially read lines try { let cont = true; @@ -53,29 +54,47 @@ export async function* llama(prompt, params = {}, config = {}) { break; } - // sse answers in the form multiple lines of: value\n with data always present as a key. in our case we - // mainly care about the data: key here, which we expect as json - const text = decoder.decode(result.value); + // Add any leftover data to the current chunk of data + const text = leftover + decoder.decode(result.value); - // parse all sse events and add them to result - const regex = /^(\S+):\s(.*)$/gm; - for (const match of text.matchAll(regex)) { - result[match[1]] = match[2] - } + // Check if the last character is a line break + const endsWithLineBreak = text.endsWith('\n'); - // since we know this is llama.cpp, let's just decode the json in data - result.data = JSON.parse(result.data); - content += result.data.content; + // Split the text into lines + let lines = text.split('\n'); - // yield - yield result; + // If the text doesn't end with a line break, then the last line is incomplete + // Store it in leftover to be added to the next chunk of data + if (!endsWithLineBreak) { + leftover = lines.pop(); + } else { + leftover = ""; // Reset leftover if we have a line break at the end + } - // if we got a stop token from server, we will break here - if (result.data.stop) { - if (result.data.generation_settings) { - generation_settings = result.data.generation_settings; + // Parse all sse events and add them to result + const regex = /^(\S+):\s(.*)$/gm; + for (const line of lines) { + const match = regex.exec(line); + if (match) { + result[match[1]] = match[2] + // since we know this is llama.cpp, let's just decode the json in data + if (result.data) { + result.data = JSON.parse(result.data); + content += result.data.content; + + // yield + yield result; + + // if we got a stop token from server, we will break here + if (result.data.stop) { + if (result.data.generation_settings) { + generation_settings = result.data.generation_settings; + } + cont = false; + break; + } + } } - break; } } } catch (e) { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c0725088f1018..6f7a66da108c8 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1274,7 +1274,11 @@ int main(int argc, char **argv) sink.done(); return true; }; - res.set_chunked_content_provider("text/event-stream", chunked_content_provider); + const auto on_complete = [&](bool) { + llama.mutex.unlock(); + }; + lock.release(); + res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); } }); svr.Get("/model.json", [&llama](const Request &, Response &res) From 18bb0ab1279033647a2a521a7d5d963ca967bc9a Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 4 Aug 2023 21:47:17 +0800 Subject: [PATCH 53/70] up ver, support 16k ctx --- expose.h | 2 +- gpttype_adapter.cpp | 3 ++- koboldcpp.py | 8 ++++---- otherarch/gpt2_v3.cpp | 3 ++- otherarch/gptj_v3.cpp | 2 +- otherarch/mpt_v3.cpp | 2 +- otherarch/neox_v3.cpp | 2 +- 7 files changed, 12 insertions(+), 10 deletions(-) diff --git a/expose.h b/expose.h index 2a8b674fabbf5..0905bc3ec8bb9 100644 --- a/expose.h +++ b/expose.h @@ -75,7 +75,7 @@ struct generation_inputs struct generation_outputs { int status = -1; - char text[16384]; //16kb should be enough for any response + char text[24576]; //24kb should be enough for any response }; extern std::string executable_path; diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index daf56d983198f..e3941453f6cfe 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -374,7 +374,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in else { //approximate NTK aware ctx - rope_freq_base = (params.n_ctx <= 3072 ? 26000.0f : (params.n_ctx <= 4096 ? 32000.0f : (params.n_ctx <= 6144 ? 54000.0f : 82684.0f))); + rope_freq_base = (params.n_ctx <= 3072 ? 26000.0f : (params.n_ctx <= 4096 ? 32000.0f : (params.n_ctx <= 6144 ? 54000.0f : (params.n_ctx <= 8192 ? 82684.0f : (params.n_ctx <= 12288 ? 140000.0f : 200000.0f))))); + } printf("Using automatic RoPE scaling (scale:%.3f, base:%.1f)\n",rope_freq_scale,rope_freq_base); diff --git a/koboldcpp.py b/koboldcpp.py index 59b6d86e390bb..1e140b9bb8890 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -66,7 +66,7 @@ class generation_inputs(ctypes.Structure): class generation_outputs(ctypes.Structure): _fields_ = [("status", ctypes.c_int), - ("text", ctypes.c_char * 16384)] + ("text", ctypes.c_char * 24576)] handle = None @@ -303,7 +303,7 @@ def utfprint(str): maxhordelen = 256 modelbusy = threading.Lock() defaultport = 5001 -KcppVersion = "1.38" +KcppVersion = "1.39" showdebug = True showsamplerwarning = True showmaxctxwarning = True @@ -711,7 +711,7 @@ def show_new_gui(): # slider data blasbatchsize_values = ["-1", "32", "64", "128", "256", "512", "1024"] blasbatchsize_text = ["Don't Batch BLAS","32","64","128","256","512","1024"] - contextsize_text = ["512", "1024", "2048", "3072", "4096", "6144", "8192"] + contextsize_text = ["512", "1024", "2048", "3072", "4096", "6144", "8192", "12288", "16384"] runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib) or os.name == 'nt' and file_exists(opt + ".dll")] antirunopts = [opt.replace("Use ", "") for lib, opt in lib_option_pairs if not file_exists(lib) or os.name == 'nt' and not file_exists(opt + ".dll")] if not any(runopts): @@ -1713,7 +1713,7 @@ def main(args): parser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0) parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true') parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true') - parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,3072,4096,6144,8192], default=2048) + parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,3072,4096,6144,8192,12288,16384], default=2048) parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512) parser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+') parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true') diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index 608a61ac226d7..dd178ca021760 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -416,9 +416,10 @@ bool gpt2_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024; + static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024*(hparams.n_ctx>8192?2:1); static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024; + static void * scr0 = malloc(scr0_size); static void * scr1 = malloc(scr1_size); diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index 91eb355bb5840..1cc566208f780 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -417,7 +417,7 @@ bool gptj_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 512u*1024*1024; + static size_t scr0_size = 512u*1024*1024*(hparams.n_ctx>8192?2:1); static size_t scr1_size = 512u*1024*1024; static void * scr0 = malloc(scr0_size); diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index 211464f89c4d8..6d2b46568176a 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -354,7 +354,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way //MPT 30B needs more scratch memory - static size_t scr0_size = (n_embd>=7168?2048u:1024u)*1024*1024; + static size_t scr0_size = (n_embd>=7168?2048u:1024u)*1024*1024*(hparams.n_ctx>8192?2:1); static size_t scr1_size = (n_embd>=7168?2048u:1024u)*1024*1024; static void * scr0 = malloc(scr0_size); diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index fdcaed9bba8fb..61bc9230e90eb 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -433,7 +433,7 @@ bool gpt_neox_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024; + static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024*(hparams.n_ctx>8192?2:1); static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024; static void * scr0 = malloc(scr0_size); From 3498588e0fb4daf040c4e3c698595cb0bfd345c0 Mon Sep 17 00:00:00 2001 From: DannyDaemonic Date: Fri, 4 Aug 2023 08:20:12 -0700 Subject: [PATCH 54/70] Add --simple-io option for subprocesses and break out console.h and cpp (#1558) --- Makefile | 5 +- examples/CMakeLists.txt | 2 + examples/common.cpp | 377 +----------------------------- examples/common.h | 45 +--- examples/console.cpp | 494 ++++++++++++++++++++++++++++++++++++++++ examples/console.h | 19 ++ examples/main/main.cpp | 29 ++- 7 files changed, 536 insertions(+), 435 deletions(-) create mode 100644 examples/console.cpp create mode 100644 examples/console.h diff --git a/Makefile b/Makefile index a692a39ea85e0..e0528aeee24c0 100644 --- a/Makefile +++ b/Makefile @@ -340,6 +340,9 @@ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-ut common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c $< -o $@ +console.o: examples/console.cpp examples/console.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -353,7 +356,7 @@ clean: # Examples # -main: examples/main/main.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) +main: examples/main/main.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 4b1f1cf44aaa1..a7b26776ad355 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -13,6 +13,8 @@ set(TARGET common) add_library(${TARGET} OBJECT common.h common.cpp + console.h + console.cpp grammar-parser.h grammar-parser.cpp ) diff --git a/examples/common.cpp b/examples/common.cpp index 3e7c3b696f539..21f4a0357d422 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -25,7 +25,6 @@ #else #include #include -#include #endif #if defined(_MSC_VER) @@ -329,6 +328,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.instruct = true; } else if (arg == "--multiline-input") { params.multiline_input = true; + } else if (arg == "--simple-io") { + params.simple_io = true; } else if (arg == "--color") { params.use_color = true; } else if (arg == "--mlock") { @@ -598,6 +599,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " --mtest compute maximum memory usage\n"); fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n"); fprintf(stdout, " --verbose-prompt print prompt before generation\n"); + fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stdout, " -m FNAME, --model FNAME\n"); @@ -690,376 +692,3 @@ std::tuple llama_init_from_gpt_par return std::make_tuple(model, lctx); } - -void console_init(console_state & con_st) { -#if defined(_WIN32) - // Windows-specific console initialization - DWORD dwMode = 0; - con_st.hConsole = GetStdHandle(STD_OUTPUT_HANDLE); - if (con_st.hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(con_st.hConsole, &dwMode)) { - con_st.hConsole = GetStdHandle(STD_ERROR_HANDLE); - if (con_st.hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(con_st.hConsole, &dwMode))) { - con_st.hConsole = NULL; - } - } - if (con_st.hConsole) { - // Enable ANSI colors on Windows 10+ - if (con_st.use_color && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) { - SetConsoleMode(con_st.hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING); - } - // Set console output codepage to UTF8 - SetConsoleOutputCP(CP_UTF8); - } - HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE); - if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) { - // Set console input codepage to UTF16 - _setmode(_fileno(stdin), _O_WTEXT); - - // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT) - dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT); - SetConsoleMode(hConIn, dwMode); - } -#else - // POSIX-specific console initialization - struct termios new_termios; - tcgetattr(STDIN_FILENO, &con_st.prev_state); - new_termios = con_st.prev_state; - new_termios.c_lflag &= ~(ICANON | ECHO); - new_termios.c_cc[VMIN] = 1; - new_termios.c_cc[VTIME] = 0; - tcsetattr(STDIN_FILENO, TCSANOW, &new_termios); - - con_st.tty = fopen("/dev/tty", "w+"); - if (con_st.tty != nullptr) { - con_st.out = con_st.tty; - } - - setlocale(LC_ALL, ""); -#endif -} - -void console_cleanup(console_state & con_st) { - // Reset console color - console_set_color(con_st, CONSOLE_COLOR_DEFAULT); - -#if !defined(_WIN32) - if (con_st.tty != nullptr) { - con_st.out = stdout; - fclose(con_st.tty); - con_st.tty = nullptr; - } - // Restore the terminal settings on POSIX systems - tcsetattr(STDIN_FILENO, TCSANOW, &con_st.prev_state); -#endif -} - -/* Keep track of current color of output, and emit ANSI code if it changes. */ -void console_set_color(console_state & con_st, console_color_t color) { - if (con_st.use_color && con_st.color != color) { - fflush(stdout); - switch(color) { - case CONSOLE_COLOR_DEFAULT: - fprintf(con_st.out, ANSI_COLOR_RESET); - break; - case CONSOLE_COLOR_PROMPT: - fprintf(con_st.out, ANSI_COLOR_YELLOW); - break; - case CONSOLE_COLOR_USER_INPUT: - fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN); - break; - case CONSOLE_COLOR_ERROR: - fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED); - break; - } - con_st.color = color; - fflush(con_st.out); - } -} - -char32_t getchar32() { -#if defined(_WIN32) - HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE); - wchar_t high_surrogate = 0; - - while (true) { - INPUT_RECORD record; - DWORD count; - if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) { - return WEOF; - } - - if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) { - wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar; - if (wc == 0) { - continue; - } - - if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate - high_surrogate = wc; - continue; - } else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate - if (high_surrogate != 0) { // Check if we have a high surrogate - return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000; - } - } - - high_surrogate = 0; // Reset the high surrogate - return static_cast(wc); - } - } -#else - wchar_t wc = getwchar(); - if (static_cast(wc) == WEOF) { - return WEOF; - } - -#if WCHAR_MAX == 0xFFFF - if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate - wchar_t low_surrogate = getwchar(); - if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate - return (static_cast(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000; - } - } - if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair - return 0xFFFD; // Return the replacement character U+FFFD - } -#endif - - return static_cast(wc); -#endif -} - -void pop_cursor(console_state & con_st) { -#if defined(_WIN32) - if (con_st.hConsole != NULL) { - CONSOLE_SCREEN_BUFFER_INFO bufferInfo; - GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo); - - COORD newCursorPosition = bufferInfo.dwCursorPosition; - if (newCursorPosition.X == 0) { - newCursorPosition.X = bufferInfo.dwSize.X - 1; - newCursorPosition.Y -= 1; - } else { - newCursorPosition.X -= 1; - } - - SetConsoleCursorPosition(con_st.hConsole, newCursorPosition); - return; - } -#endif - putc('\b', con_st.out); -} - -int estimateWidth(char32_t codepoint) { -#if defined(_WIN32) - return 1; -#else - return wcwidth(codepoint); -#endif -} - -int put_codepoint(console_state & con_st, const char* utf8_codepoint, size_t length, int expectedWidth) { -#if defined(_WIN32) - CONSOLE_SCREEN_BUFFER_INFO bufferInfo; - if (!GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo)) { - // go with the default - return expectedWidth; - } - COORD initialPosition = bufferInfo.dwCursorPosition; - DWORD nNumberOfChars = length; - WriteConsole(con_st.hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL); - - CONSOLE_SCREEN_BUFFER_INFO newBufferInfo; - GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo); - - // Figure out our real position if we're in the last column - if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) { - DWORD nNumberOfChars; - WriteConsole(con_st.hConsole, &" \b", 2, &nNumberOfChars, NULL); - GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo); - } - - int width = newBufferInfo.dwCursorPosition.X - initialPosition.X; - if (width < 0) { - width += newBufferInfo.dwSize.X; - } - return width; -#else - // we can trust expectedWidth if we've got one - if (expectedWidth >= 0 || con_st.tty == nullptr) { - fwrite(utf8_codepoint, length, 1, con_st.out); - return expectedWidth; - } - - fputs("\033[6n", con_st.tty); // Query cursor position - int x1, x2, y1, y2; - int results = 0; - results = fscanf(con_st.tty, "\033[%d;%dR", &y1, &x1); - - fwrite(utf8_codepoint, length, 1, con_st.tty); - - fputs("\033[6n", con_st.tty); // Query cursor position - results += fscanf(con_st.tty, "\033[%d;%dR", &y2, &x2); - - if (results != 4) { - return expectedWidth; - } - - int width = x2 - x1; - if (width < 0) { - // Calculate the width considering text wrapping - struct winsize w; - ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); - width += w.ws_col; - } - return width; -#endif -} - -void replace_last(console_state & con_st, char ch) { -#if defined(_WIN32) - pop_cursor(con_st); - put_codepoint(con_st, &ch, 1, 1); -#else - fprintf(con_st.out, "\b%c", ch); -#endif -} - -void append_utf8(char32_t ch, std::string & out) { - if (ch <= 0x7F) { - out.push_back(static_cast(ch)); - } else if (ch <= 0x7FF) { - out.push_back(static_cast(0xC0 | ((ch >> 6) & 0x1F))); - out.push_back(static_cast(0x80 | (ch & 0x3F))); - } else if (ch <= 0xFFFF) { - out.push_back(static_cast(0xE0 | ((ch >> 12) & 0x0F))); - out.push_back(static_cast(0x80 | ((ch >> 6) & 0x3F))); - out.push_back(static_cast(0x80 | (ch & 0x3F))); - } else if (ch <= 0x10FFFF) { - out.push_back(static_cast(0xF0 | ((ch >> 18) & 0x07))); - out.push_back(static_cast(0x80 | ((ch >> 12) & 0x3F))); - out.push_back(static_cast(0x80 | ((ch >> 6) & 0x3F))); - out.push_back(static_cast(0x80 | (ch & 0x3F))); - } else { - // Invalid Unicode code point - } -} - -// Helper function to remove the last UTF-8 character from a string -void pop_back_utf8_char(std::string & line) { - if (line.empty()) { - return; - } - - size_t pos = line.length() - 1; - - // Find the start of the last UTF-8 character (checking up to 4 bytes back) - for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) { - if ((line[pos] & 0xC0) != 0x80) break; // Found the start of the character - } - line.erase(pos); -} - -bool console_readline(console_state & con_st, std::string & line) { - console_set_color(con_st, CONSOLE_COLOR_USER_INPUT); - if (con_st.out != stdout) { - fflush(stdout); - } - - line.clear(); - std::vector widths; - bool is_special_char = false; - bool end_of_stream = false; - - char32_t input_char; - while (true) { - fflush(con_st.out); // Ensure all output is displayed before waiting for input - input_char = getchar32(); - - if (input_char == '\r' || input_char == '\n') { - break; - } - - if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) { - end_of_stream = true; - break; - } - - if (is_special_char) { - console_set_color(con_st, CONSOLE_COLOR_USER_INPUT); - replace_last(con_st, line.back()); - is_special_char = false; - } - - if (input_char == '\033') { // Escape sequence - char32_t code = getchar32(); - if (code == '[' || code == 0x1B) { - // Discard the rest of the escape sequence - while ((code = getchar32()) != (char32_t) WEOF) { - if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') { - break; - } - } - } - } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace - if (!widths.empty()) { - int count; - do { - count = widths.back(); - widths.pop_back(); - // Move cursor back, print space, and move cursor back again - for (int i = 0; i < count; i++) { - replace_last(con_st, ' '); - pop_cursor(con_st); - } - pop_back_utf8_char(line); - } while (count == 0 && !widths.empty()); - } - } else { - int offset = line.length(); - append_utf8(input_char, line); - int width = put_codepoint(con_st, line.c_str() + offset, line.length() - offset, estimateWidth(input_char)); - if (width < 0) { - width = 0; - } - widths.push_back(width); - } - - if (!line.empty() && (line.back() == '\\' || line.back() == '/')) { - console_set_color(con_st, CONSOLE_COLOR_PROMPT); - replace_last(con_st, line.back()); - is_special_char = true; - } - } - - bool has_more = con_st.multiline_input; - if (is_special_char) { - replace_last(con_st, ' '); - pop_cursor(con_st); - - char last = line.back(); - line.pop_back(); - if (last == '\\') { - line += '\n'; - fputc('\n', con_st.out); - has_more = !has_more; - } else { - // llama will just eat the single space, it won't act as a space - if (line.length() == 1 && line.back() == ' ') { - line.clear(); - pop_cursor(con_st); - } - has_more = false; - } - } else { - if (end_of_stream) { - has_more = false; - } else { - line += '\n'; - fputc('\n', con_st.out); - } - } - - fflush(con_st.out); - return has_more; -} diff --git a/examples/common.h b/examples/common.h index 974484207251d..375bc0a3db416 100644 --- a/examples/common.h +++ b/examples/common.h @@ -11,11 +11,6 @@ #include #include -#if !defined (_WIN32) -#include -#include -#endif - // // CLI argument parsing // @@ -85,6 +80,7 @@ struct gpt_params { bool embedding = false; // get only sentence embedding bool interactive_first = false; // wait for user input immediately bool multiline_input = false; // reverse the usage of `\` + bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool instruct = false; // instruction mode (used for Alpaca models) @@ -116,42 +112,3 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s std::tuple llama_init_from_gpt_params(const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); - -// -// Console utils -// - -#define ANSI_COLOR_RED "\x1b[31m" -#define ANSI_COLOR_GREEN "\x1b[32m" -#define ANSI_COLOR_YELLOW "\x1b[33m" -#define ANSI_COLOR_BLUE "\x1b[34m" -#define ANSI_COLOR_MAGENTA "\x1b[35m" -#define ANSI_COLOR_CYAN "\x1b[36m" -#define ANSI_COLOR_RESET "\x1b[0m" -#define ANSI_BOLD "\x1b[1m" - -enum console_color_t { - CONSOLE_COLOR_DEFAULT=0, - CONSOLE_COLOR_PROMPT, - CONSOLE_COLOR_USER_INPUT, - CONSOLE_COLOR_ERROR -}; - -struct console_state { - bool multiline_input = false; - bool use_color = false; - console_color_t color = CONSOLE_COLOR_DEFAULT; - - FILE* out = stdout; -#if defined (_WIN32) - void* hConsole; -#else - FILE* tty = nullptr; - termios prev_state; -#endif -}; - -void console_init(console_state & con_st); -void console_cleanup(console_state & con_st); -void console_set_color(console_state & con_st, console_color_t color); -bool console_readline(console_state & con_st, std::string & line); diff --git a/examples/console.cpp b/examples/console.cpp new file mode 100644 index 0000000000000..4c32f3b09c31d --- /dev/null +++ b/examples/console.cpp @@ -0,0 +1,494 @@ +#include "console.h" +#include +#include + +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_YELLOW "\x1b[33m" +#define ANSI_COLOR_BLUE "\x1b[34m" +#define ANSI_COLOR_MAGENTA "\x1b[35m" +#define ANSI_COLOR_CYAN "\x1b[36m" +#define ANSI_COLOR_RESET "\x1b[0m" +#define ANSI_BOLD "\x1b[1m" + +namespace console { + + // + // Console state + // + + static bool advanced_display = false; + static bool simple_io = true; + static display_t current_display = reset; + + static FILE* out = stdout; + +#if defined (_WIN32) + static void* hConsole; +#else + static FILE* tty = nullptr; + static termios initial_state; +#endif + + // + // Init and cleanup + // + + void init(bool use_simple_io, bool use_advanced_display) { + advanced_display = use_advanced_display; + simple_io = use_simple_io; +#if defined(_WIN32) + // Windows-specific console initialization + DWORD dwMode = 0; + hConsole = GetStdHandle(STD_OUTPUT_HANDLE); + if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) { + hConsole = GetStdHandle(STD_ERROR_HANDLE); + if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) { + hConsole = nullptr; + simple_io = true; + } + } + if (hConsole) { + // Enable ANSI colors on Windows 10+ + if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) { + SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING); + } + // Set console output codepage to UTF8 + SetConsoleOutputCP(CP_UTF8); + } + HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE); + if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) { + // Set console input codepage to UTF16 + _setmode(_fileno(stdin), _O_WTEXT); + + if (!simple_io) { + // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT) + dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT); + } + if (!SetConsoleMode(hConIn, dwMode)) { + simple_io = true; + } + } +#else + // POSIX-specific console initialization + if (!simple_io) { + struct termios new_termios; + tcgetattr(STDIN_FILENO, &initial_state); + new_termios = initial_state; + new_termios.c_lflag &= ~(ICANON | ECHO); + new_termios.c_cc[VMIN] = 1; + new_termios.c_cc[VTIME] = 0; + tcsetattr(STDIN_FILENO, TCSANOW, &new_termios); + + tty = fopen("/dev/tty", "w+"); + if (tty != nullptr) { + out = tty; + } + } + + setlocale(LC_ALL, ""); +#endif + } + + void cleanup() { + // Reset console display + set_display(reset); + +#if !defined(_WIN32) + // Restore settings on POSIX systems + if (!simple_io) { + if (tty != nullptr) { + out = stdout; + fclose(tty); + tty = nullptr; + } + tcsetattr(STDIN_FILENO, TCSANOW, &initial_state); + } +#endif + } + + // + // Display and IO + // + + // Keep track of current display and only emit ANSI code if it changes + void set_display(display_t display) { + if (advanced_display && current_display != display) { + fflush(stdout); + switch(display) { + case reset: + fprintf(out, ANSI_COLOR_RESET); + break; + case prompt: + fprintf(out, ANSI_COLOR_YELLOW); + break; + case user_input: + fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN); + break; + case error: + fprintf(out, ANSI_BOLD ANSI_COLOR_RED); + } + current_display = display; + fflush(out); + } + } + + char32_t getchar32() { +#if defined(_WIN32) + HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE); + wchar_t high_surrogate = 0; + + while (true) { + INPUT_RECORD record; + DWORD count; + if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) { + return WEOF; + } + + if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) { + wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar; + if (wc == 0) { + continue; + } + + if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate + high_surrogate = wc; + continue; + } + if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate + if (high_surrogate != 0) { // Check if we have a high surrogate + return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000; + } + } + + high_surrogate = 0; // Reset the high surrogate + return static_cast(wc); + } + } +#else + wchar_t wc = getwchar(); + if (static_cast(wc) == WEOF) { + return WEOF; + } + +#if WCHAR_MAX == 0xFFFF + if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate + wchar_t low_surrogate = getwchar(); + if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate + return (static_cast(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000; + } + } + if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair + return 0xFFFD; // Return the replacement character U+FFFD + } +#endif + + return static_cast(wc); +#endif + } + + void pop_cursor() { +#if defined(_WIN32) + if (hConsole != NULL) { + CONSOLE_SCREEN_BUFFER_INFO bufferInfo; + GetConsoleScreenBufferInfo(hConsole, &bufferInfo); + + COORD newCursorPosition = bufferInfo.dwCursorPosition; + if (newCursorPosition.X == 0) { + newCursorPosition.X = bufferInfo.dwSize.X - 1; + newCursorPosition.Y -= 1; + } else { + newCursorPosition.X -= 1; + } + + SetConsoleCursorPosition(hConsole, newCursorPosition); + return; + } +#endif + putc('\b', out); + } + + int estimateWidth(char32_t codepoint) { +#if defined(_WIN32) + return 1; +#else + return wcwidth(codepoint); +#endif + } + + int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) { +#if defined(_WIN32) + CONSOLE_SCREEN_BUFFER_INFO bufferInfo; + if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) { + // go with the default + return expectedWidth; + } + COORD initialPosition = bufferInfo.dwCursorPosition; + DWORD nNumberOfChars = length; + WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL); + + CONSOLE_SCREEN_BUFFER_INFO newBufferInfo; + GetConsoleScreenBufferInfo(hConsole, &newBufferInfo); + + // Figure out our real position if we're in the last column + if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) { + DWORD nNumberOfChars; + WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL); + GetConsoleScreenBufferInfo(hConsole, &newBufferInfo); + } + + int width = newBufferInfo.dwCursorPosition.X - initialPosition.X; + if (width < 0) { + width += newBufferInfo.dwSize.X; + } + return width; +#else + // We can trust expectedWidth if we've got one + if (expectedWidth >= 0 || tty == nullptr) { + fwrite(utf8_codepoint, length, 1, out); + return expectedWidth; + } + + fputs("\033[6n", tty); // Query cursor position + int x1; + int y1; + int x2; + int y2; + int results = 0; + results = fscanf(tty, "\033[%d;%dR", &y1, &x1); + + fwrite(utf8_codepoint, length, 1, tty); + + fputs("\033[6n", tty); // Query cursor position + results += fscanf(tty, "\033[%d;%dR", &y2, &x2); + + if (results != 4) { + return expectedWidth; + } + + int width = x2 - x1; + if (width < 0) { + // Calculate the width considering text wrapping + struct winsize w; + ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); + width += w.ws_col; + } + return width; +#endif + } + + void replace_last(char ch) { +#if defined(_WIN32) + pop_cursor(); + put_codepoint(&ch, 1, 1); +#else + fprintf(out, "\b%c", ch); +#endif + } + + void append_utf8(char32_t ch, std::string & out) { + if (ch <= 0x7F) { + out.push_back(static_cast(ch)); + } else if (ch <= 0x7FF) { + out.push_back(static_cast(0xC0 | ((ch >> 6) & 0x1F))); + out.push_back(static_cast(0x80 | (ch & 0x3F))); + } else if (ch <= 0xFFFF) { + out.push_back(static_cast(0xE0 | ((ch >> 12) & 0x0F))); + out.push_back(static_cast(0x80 | ((ch >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (ch & 0x3F))); + } else if (ch <= 0x10FFFF) { + out.push_back(static_cast(0xF0 | ((ch >> 18) & 0x07))); + out.push_back(static_cast(0x80 | ((ch >> 12) & 0x3F))); + out.push_back(static_cast(0x80 | ((ch >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (ch & 0x3F))); + } else { + // Invalid Unicode code point + } + } + + // Helper function to remove the last UTF-8 character from a string + void pop_back_utf8_char(std::string & line) { + if (line.empty()) { + return; + } + + size_t pos = line.length() - 1; + + // Find the start of the last UTF-8 character (checking up to 4 bytes back) + for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) { + if ((line[pos] & 0xC0) != 0x80) { + break; // Found the start of the character + } + } + line.erase(pos); + } + + bool readline_advanced(std::string & line, bool multiline_input) { + if (out != stdout) { + fflush(stdout); + } + + line.clear(); + std::vector widths; + bool is_special_char = false; + bool end_of_stream = false; + + char32_t input_char; + while (true) { + fflush(out); // Ensure all output is displayed before waiting for input + input_char = getchar32(); + + if (input_char == '\r' || input_char == '\n') { + break; + } + + if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) { + end_of_stream = true; + break; + } + + if (is_special_char) { + set_display(user_input); + replace_last(line.back()); + is_special_char = false; + } + + if (input_char == '\033') { // Escape sequence + char32_t code = getchar32(); + if (code == '[' || code == 0x1B) { + // Discard the rest of the escape sequence + while ((code = getchar32()) != (char32_t) WEOF) { + if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') { + break; + } + } + } + } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace + if (!widths.empty()) { + int count; + do { + count = widths.back(); + widths.pop_back(); + // Move cursor back, print space, and move cursor back again + for (int i = 0; i < count; i++) { + replace_last(' '); + pop_cursor(); + } + pop_back_utf8_char(line); + } while (count == 0 && !widths.empty()); + } + } else { + int offset = line.length(); + append_utf8(input_char, line); + int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char)); + if (width < 0) { + width = 0; + } + widths.push_back(width); + } + + if (!line.empty() && (line.back() == '\\' || line.back() == '/')) { + set_display(prompt); + replace_last(line.back()); + is_special_char = true; + } + } + + bool has_more = multiline_input; + if (is_special_char) { + replace_last(' '); + pop_cursor(); + + char last = line.back(); + line.pop_back(); + if (last == '\\') { + line += '\n'; + fputc('\n', out); + has_more = !has_more; + } else { + // llama will just eat the single space, it won't act as a space + if (line.length() == 1 && line.back() == ' ') { + line.clear(); + pop_cursor(); + } + has_more = false; + } + } else { + if (end_of_stream) { + has_more = false; + } else { + line += '\n'; + fputc('\n', out); + } + } + + fflush(out); + return has_more; + } + + bool readline_simple(std::string & line, bool multiline_input) { +#if defined(_WIN32) + std::wstring wline; + if (!std::getline(std::wcin, wline)) { + // Input stream is bad or EOF received + line.clear(); + GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0); + return false; + } + + int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL); + line.resize(size_needed); + WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL); +#else + if (!std::getline(std::cin, line)) { + // Input stream is bad or EOF received + line.clear(); + return false; + } +#endif + if (!line.empty()) { + char last = line.back(); + if (last == '/') { // Always return control on '/' symbol + line.pop_back(); + return false; + } + if (last == '\\') { // '\\' changes the default action + line.pop_back(); + multiline_input = !multiline_input; + } + } + line += '\n'; + + // By default, continue input if multiline_input is set + return multiline_input; + } + + bool readline(std::string & line, bool multiline_input) { + set_display(user_input); + + if (simple_io) { + return readline_simple(line, multiline_input); + } + return readline_advanced(line, multiline_input); + } + +} diff --git a/examples/console.h b/examples/console.h new file mode 100644 index 0000000000000..ec175269b9d8a --- /dev/null +++ b/examples/console.h @@ -0,0 +1,19 @@ +// Console functions + +#pragma once + +#include + +namespace console { + enum display_t { + reset = 0, + prompt, + user_input, + error + }; + + void init(bool use_simple_io, bool use_advanced_display); + void cleanup(); + void set_display(display_t display); + bool readline(std::string & line, bool multiline_input); +} diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 3796a92301136..56ada7e69d99d 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -4,6 +4,7 @@ #endif #include "common.h" +#include "console.h" #include "llama.h" #include "build-info.h" #include "grammar-parser.h" @@ -35,9 +36,7 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static console_state con_st; static llama_context ** g_ctx; - static bool is_interacting = false; #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) @@ -46,7 +45,7 @@ void sigint_handler(int signo) { if (!is_interacting) { is_interacting=true; } else { - console_cleanup(con_st); + console::cleanup(); printf("\n"); llama_print_timings(*g_ctx); _exit(130); @@ -64,10 +63,8 @@ int main(int argc, char ** argv) { // save choice to use color for later // (note for later: this is a slightly awkward choice) - con_st.use_color = params.use_color; - con_st.multiline_input = params.multiline_input; - console_init(con_st); - atexit([]() { console_cleanup(con_st); }); + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); if (params.perplexity) { printf("\n************\n"); @@ -373,7 +370,7 @@ int main(int argc, char ** argv) { if (params.interactive) { const char *control_message; - if (con_st.multiline_input) { + if (params.multiline_input) { control_message = " - To return control to LLaMa, end your input with '\\'.\n" " - To return control without starting a new line, end your input with '/'.\n"; } else { @@ -401,7 +398,7 @@ int main(int argc, char ** argv) { int n_past_guidance = 0; // the first thing we will do is to output the prompt, so set color accordingly - console_set_color(con_st, CONSOLE_COLOR_PROMPT); + console::set_display(console::prompt); std::vector embd; std::vector embd_guidance; @@ -422,9 +419,9 @@ int main(int argc, char ** argv) { // Ensure the input doesn't exceed the context size by truncating embd if necessary. if ((int)embd.size() > max_embd_size) { auto skipped_tokens = embd.size() - max_embd_size; - console_set_color(con_st, CONSOLE_COLOR_ERROR); + console::set_display(console::error); printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); - console_set_color(con_st, CONSOLE_COLOR_DEFAULT); + console::set_display(console::reset); fflush(stdout); embd.resize(max_embd_size); } @@ -667,7 +664,7 @@ int main(int argc, char ** argv) { } // reset color to default if we there is no pending user input if (input_echo && (int)embd_inp.size() == n_consumed) { - console_set_color(con_st, CONSOLE_COLOR_DEFAULT); + console::set_display(console::reset); } // if not currently processing queued inputs; @@ -693,7 +690,7 @@ int main(int argc, char ** argv) { if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) { if (params.interactive) { is_interacting = true; - console_set_color(con_st, CONSOLE_COLOR_USER_INPUT); + console::set_display(console::user_input); } is_antiprompt = true; fflush(stdout); @@ -714,7 +711,7 @@ int main(int argc, char ** argv) { is_interacting = true; printf("\n"); - console_set_color(con_st, CONSOLE_COLOR_USER_INPUT); + console::set_display(console::user_input); fflush(stdout); } else if (params.instruct) { is_interacting = true; @@ -739,12 +736,12 @@ int main(int argc, char ** argv) { std::string line; bool another_line = true; do { - another_line = console_readline(con_st, line); + another_line = console::readline(line, params.multiline_input); buffer += line; } while (another_line); // done taking input, reset color - console_set_color(con_st, CONSOLE_COLOR_DEFAULT); + console::set_display(console::reset); // Add tokens to embd only if the input buffer is non-empty // Entering a empty line lets the user pass control back From 02f9d96a866268700b8d8e7acbbcb4392c5ff345 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Fri, 4 Aug 2023 11:34:32 -0400 Subject: [PATCH 55/70] CUDA: check if event is NULL before cudaStreamWaitEvent (#2505) Fixes #2503 --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e0192bc6ecebc..4321e46741a40 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5203,7 +5203,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm if (split && g_device_count > 1) { CUDA_CHECK(cudaSetDevice(g_main_device)); for (int id = 0; id < g_device_count; ++id) { - if (id != g_main_device) { + if (id != g_main_device && src0_extra->events[id]) { CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id])); } } From 4329d1acb01c353803a54733b8eef9d93d0b84b2 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Fri, 4 Aug 2023 11:35:22 -0400 Subject: [PATCH 56/70] CUDA: use min compute capability of GPUs actually used (#2506) --- ggml-cuda.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 4321e46741a40..d64d7045ccf3e 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5347,7 +5347,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ } else { int min_compute_capability = INT_MAX; for (int id = 0; id < g_device_count; ++id) { - if (min_compute_capability > g_compute_capabilities[id]) { + if (min_compute_capability > g_compute_capabilities[id] + && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { min_compute_capability = g_compute_capabilities[id]; } } From 182af739c4ce237f7579facfe8f94dc53a1f573f Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Fri, 4 Aug 2023 15:00:57 -0400 Subject: [PATCH 57/70] server: regenerate completion.js.hpp (#2515) --- examples/server/completion.js.hpp | 613 ++++++++++++++++-------------- 1 file changed, 333 insertions(+), 280 deletions(-) diff --git a/examples/server/completion.js.hpp b/examples/server/completion.js.hpp index f399fb19a6bc8..f0a071a69bca1 100644 --- a/examples/server/completion.js.hpp +++ b/examples/server/completion.js.hpp @@ -87,289 +87,342 @@ unsigned char completion_js[] = { 0x20, 0x54, 0x65, 0x78, 0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, - 0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, - 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, - 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, - 0x2e, 0x72, 0x65, 0x61, 0x64, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, - 0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x73, 0x65, 0x20, 0x61, - 0x6e, 0x73, 0x77, 0x65, 0x72, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, - 0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69, - 0x70, 0x6c, 0x65, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x6f, 0x66, - 0x3a, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x5c, 0x6e, 0x20, 0x77, 0x69, - 0x74, 0x68, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x61, 0x6c, 0x77, 0x61, - 0x79, 0x73, 0x20, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x74, 0x20, 0x61, - 0x73, 0x20, 0x61, 0x20, 0x6b, 0x65, 0x79, 0x2e, 0x20, 0x69, 0x6e, 0x20, - 0x6f, 0x75, 0x72, 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x77, 0x65, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x6d, 0x61, 0x69, - 0x6e, 0x6c, 0x79, 0x20, 0x63, 0x61, 0x72, 0x65, 0x20, 0x61, 0x62, 0x6f, - 0x75, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3a, - 0x20, 0x6b, 0x65, 0x79, 0x20, 0x68, 0x65, 0x72, 0x65, 0x2c, 0x20, 0x77, - 0x68, 0x69, 0x63, 0x68, 0x20, 0x77, 0x65, 0x20, 0x65, 0x78, 0x70, 0x65, - 0x63, 0x74, 0x20, 0x61, 0x73, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, - 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, - 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x28, 0x72, 0x65, 0x73, - 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x70, 0x61, - 0x72, 0x73, 0x65, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, - 0x65, 0x76, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, - 0x64, 0x64, 0x20, 0x74, 0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, - 0x65, 0x73, 0x75, 0x6c, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, - 0x3d, 0x20, 0x2f, 0x5e, 0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, - 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x6f, 0x66, 0x20, - 0x74, 0x65, 0x78, 0x74, 0x2e, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x41, 0x6c, - 0x6c, 0x28, 0x72, 0x65, 0x67, 0x65, 0x78, 0x29, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, - 0x6c, 0x74, 0x5b, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d, - 0x20, 0x3d, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63, 0x65, 0x20, - 0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73, - 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, - 0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a, 0x75, 0x73, - 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74, 0x68, 0x65, - 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64, 0x61, 0x74, - 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, - 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53, - 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, - 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, - 0x20, 0x2b, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, - 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x79, - 0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79, - 0x69, 0x65, 0x6c, 0x64, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x69, - 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20, 0x73, - 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66, 0x72, - 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20, 0x77, - 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, - 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, - 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, - 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, - 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, - 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, - 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, - 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, - 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, - 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, - 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63, - 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d, - 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74, - 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, - 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, - 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, - 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, - 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74, - 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79, - 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x63, 0x72, - 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, - 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, - 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, - 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, - 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66, - 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, - 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, - 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, - 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, - 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76, - 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, - 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28, - 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, - 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, - 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e, - 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, - 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, - 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, - 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, - 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, - 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, - 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, - 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, - 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, - 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, - 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, - 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, - 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, - 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, - 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, - 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, - 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, - 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, - 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, - 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, - 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, - 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, - 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, - 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, - 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, - 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, - 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e, - 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, - 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, - 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, + 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, + 0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, + 0x20, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, + 0x70, 0x61, 0x72, 0x74, 0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x72, 0x65, + 0x61, 0x64, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20, + 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, + 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, + 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, + 0x65, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, + 0x65, 0x73, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, + 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61, + 0x64, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f, + 0x6e, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2f, 0x2f, 0x20, 0x41, 0x64, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6c, + 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x64, 0x61, 0x74, 0x61, + 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x75, 0x72, 0x72, + 0x65, 0x6e, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, + 0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d, + 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x2b, 0x20, + 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, + 0x64, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2f, 0x2f, 0x20, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x20, 0x69, 0x66, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63, 0x68, + 0x61, 0x72, 0x61, 0x63, 0x74, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x61, + 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65, + 0x42, 0x72, 0x65, 0x61, 0x6b, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74, + 0x2e, 0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5c, + 0x6e, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2f, 0x2f, 0x20, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6c, + 0x69, 0x6e, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, + 0x65, 0x74, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x74, + 0x65, 0x78, 0x74, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x5c, + 0x6e, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2f, 0x2f, 0x20, 0x49, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, + 0x78, 0x74, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x6e, 0x27, 0x74, 0x20, 0x65, + 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x61, 0x20, 0x6c, 0x69, + 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2c, 0x20, 0x74, 0x68, + 0x65, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, + 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x69, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6f, + 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2f, 0x2f, 0x20, 0x53, 0x74, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x74, + 0x20, 0x69, 0x6e, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, + 0x20, 0x74, 0x6f, 0x20, 0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64, + 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74, + 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61, + 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, + 0x28, 0x21, 0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, + 0x6e, 0x65, 0x42, 0x72, 0x65, 0x61, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, + 0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2e, + 0x70, 0x6f, 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, + 0x65, 0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, + 0x52, 0x65, 0x73, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, + 0x65, 0x72, 0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, 0x76, + 0x65, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, + 0x61, 0x6b, 0x20, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e, + 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x50, 0x61, 0x72, 0x73, + 0x65, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76, + 0x65, 0x6e, 0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64, + 0x20, 0x74, 0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73, + 0x75, 0x6c, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20, + 0x2f, 0x5e, 0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e, + 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x69, 0x6e, + 0x65, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, + 0x68, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x2e, 0x65, 0x78, + 0x65, 0x63, 0x28, 0x6c, 0x69, 0x6e, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61, + 0x74, 0x63, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b, + 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20, + 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, + 0x6e, 0x63, 0x65, 0x20, 0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, + 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, + 0x61, 0x2e, 0x63, 0x70, 0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, + 0x20, 0x6a, 0x75, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, + 0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, + 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, + 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, + 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, + 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, + 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, + 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, + 0x2f, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c, + 0x64, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, + 0x2f, 0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, + 0x61, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, + 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, + 0x2c, 0x20, 0x77, 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, + 0x65, 0x61, 0x6b, 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, + 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, + 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, + 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, + 0x73, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, - 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, - 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, - 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, - 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, - 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, - 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, - 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, - 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, - 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, - 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, - 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, - 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e, - 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, - 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, - 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, - 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b, - 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, - 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, - 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, - 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, - 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e, - 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73, - 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a, - 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, - 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, - 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, - 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, - 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e, - 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, - 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, - 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, - 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f, - 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, - 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, - 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, - 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, - 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, - 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, - 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, - 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, + 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, + 0x3d, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, + 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, + 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, + 0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, + 0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, + 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, + 0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, + 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, + 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, + 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, + 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, + 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, + 0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x63, + 0x72, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, + 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, + 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, + 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, + 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, + 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, + 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, + 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, + 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, + 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, + 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, + 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, + 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, + 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, + 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, + 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, + 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, + 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, + 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50, - 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, - 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72, - 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, - 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, - 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, - 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, - 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, - 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, - 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, - 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, - 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65, - 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f, - 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65, - 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65, - 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, - 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, - 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, - 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, - 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, - 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74, - 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69, - 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65, - 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69, - 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20, - 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20, - 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20, - 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, - 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, - 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, - 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e, - 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, - 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, + 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, + 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, + 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, + 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, + 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, + 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, + 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, + 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, + 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, + 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, + 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, + 0x74, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, + 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, + 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, + 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, + 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, + 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, + 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, + 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, + 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, + 0x74, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, + 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, + 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, + 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, + 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, + 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, + 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, + 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, + 0x22, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, + 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, + 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, + 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, + 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, + 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, + 0x6e, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, + 0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, + 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, + 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, + 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, + 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, + 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, + 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, + 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, + 0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, + 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, + 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, + 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, + 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, + 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, + 0x28, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, + 0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, + 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, + 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, + 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, + 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, + 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, + 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, + 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, + 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, + 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, + 0x6d, 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, + 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, + 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, + 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, + 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, + 0x63, 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, + 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, + 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, + 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, + 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, + 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, + 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, + 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, + 0x76, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, + 0x20, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, + 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, + 0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, + 0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, + 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, + 0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, + 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, + 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, + 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, + 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, + 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, + 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, + 0x61, 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, + 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, + 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, + 0x69, 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, + 0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, + 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, + 0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, + 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, + 0x73, 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, + 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, + 0x61, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, + 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, + 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, + 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, + 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61, + 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, + 0x2f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22, + 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, + 0x72, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, - 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x77, - 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, - 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22, 0x29, - 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, - 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, - 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, - 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a + 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a }; -unsigned int completion_js_len = 4462; +unsigned int completion_js_len = 5099; From 332311234a0aa2974b2450710e22e09d90dd6b0b Mon Sep 17 00:00:00 2001 From: Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com> Date: Fri, 4 Aug 2023 20:16:11 +0000 Subject: [PATCH 58/70] fix firefox autoscroll (#2519) --- examples/server/index.html.hpp | 1089 ++++++++++++++--------------- examples/server/public/index.html | 5 +- 2 files changed, 547 insertions(+), 547 deletions(-) diff --git a/examples/server/index.html.hpp b/examples/server/index.html.hpp index 5bca2a8ab03ac..bd0b4787068e0 100644 --- a/examples/server/index.html.hpp +++ b/examples/server/index.html.hpp @@ -554,600 +554,599 @@ unsigned char index_html[] = { 0x20, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x20, 0x74, 0x6f, 0x20, 0x62, 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x20, 0x28, 0x69, 0x66, 0x20, 0x6e, 0x65, 0x65, 0x64, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, - 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x20, - 0x26, 0x26, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, - 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, - 0x6f, 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x20, 0x3c, 0x3d, - 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, - 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, - 0x6c, 0x54, 0x6f, 0x70, 0x20, 0x2b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, - 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x65, + 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x70, + 0x61, 0x72, 0x65, 0x6e, 0x74, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, + 0x20, 0x28, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x26, 0x26, 0x20, + 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, + 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x20, 0x3c, 0x3d, 0x20, 0x70, + 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, + 0x54, 0x6f, 0x70, 0x20, 0x2b, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x20, 0x2b, 0x20, 0x33, 0x30, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, - 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x54, 0x6f, 0x28, - 0x30, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, - 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, - 0x6f, 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x29, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x6d, 0x65, 0x73, 0x73, 0x61, - 0x67, 0x65, 0x73, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x74, 0x4c, - 0x69, 0x6e, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x5b, 0x75, 0x73, 0x65, 0x72, - 0x2c, 0x20, 0x6d, 0x73, 0x67, 0x5d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x70, 0x20, - 0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x6d, 0x73, 0x67, 0x7d, 0x3e, 0x3c, - 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x7b, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x75, 0x73, 0x65, 0x72, 0x29, 0x7d, - 0x3a, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x20, 0x3c, - 0x24, 0x7b, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, - 0x68, 0x7d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x24, 0x7b, 0x74, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x6d, 0x73, 0x67, 0x29, 0x7d, - 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, - 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, - 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, - 0x68, 0x61, 0x74, 0x22, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, 0x7b, 0x63, - 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x7d, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6d, - 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x66, 0x6c, 0x61, 0x74, - 0x4d, 0x61, 0x70, 0x28, 0x63, 0x68, 0x61, 0x74, 0x4c, 0x69, 0x6e, 0x65, - 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x2f, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x60, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, - 0x46, 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, - 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, - 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, - 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x73, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, - 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, - 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, - 0x3a, 0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, - 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x28, - 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, - 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, - 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, 0x65, - 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x3d, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, + 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x54, 0x6f, + 0x28, 0x30, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, + 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x29, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x6d, 0x65, 0x73, + 0x73, 0x61, 0x67, 0x65, 0x73, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, + 0x74, 0x4c, 0x69, 0x6e, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x5b, 0x75, 0x73, + 0x65, 0x72, 0x2c, 0x20, 0x6d, 0x73, 0x67, 0x5d, 0x29, 0x20, 0x3d, 0x3e, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, + 0x70, 0x20, 0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x6d, 0x73, 0x67, 0x7d, + 0x3e, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x7b, 0x74, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x75, 0x73, 0x65, 0x72, + 0x29, 0x7d, 0x3a, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, + 0x20, 0x3c, 0x24, 0x7b, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, + 0x69, 0x73, 0x68, 0x7d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x24, 0x7b, + 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x6d, 0x73, 0x67, + 0x29, 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x60, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, + 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x64, 0x3d, + 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, + 0x7b, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x7d, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, + 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x66, 0x6c, + 0x61, 0x74, 0x4d, 0x61, 0x70, 0x28, 0x63, 0x68, 0x61, 0x74, 0x4c, 0x69, + 0x6e, 0x65, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x2f, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x60, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x6f, 0x6e, 0x66, + 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, + 0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, + 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, + 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x73, 0x65, + 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, + 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, + 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, + 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, + 0x65, 0x5d, 0x3a, 0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, + 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, + 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, - 0x20, 0x70, 0x61, 0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x28, - 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, - 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x20, 0x3d, - 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, - 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, - 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, - 0x20, 0x4d, 0x61, 0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28, - 0x70, 0x61, 0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x65, - 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x29, 0x29, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x46, 0x6c, 0x6f, 0x61, - 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x28, 0x7b, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x2c, 0x20, 0x6d, - 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x74, - 0x65, 0x70, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, - 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, - 0x7d, 0x22, 0x3e, 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x7d, 0x3c, - 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, - 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, - 0x67, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, - 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x24, 0x7b, - 0x6d, 0x69, 0x6e, 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x24, - 0x7b, 0x6d, 0x61, 0x78, 0x7d, 0x22, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3d, - 0x22, 0x24, 0x7b, 0x73, 0x74, 0x65, 0x70, 0x7d, 0x22, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, - 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, - 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x7d, 0x20, 0x2f, + 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, + 0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, + 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, + 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, + 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, + 0x5d, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, + 0x74, 0x28, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, + 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, + 0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, + 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, + 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, + 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, + 0x5d, 0x3a, 0x20, 0x4d, 0x61, 0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f, 0x6f, + 0x72, 0x28, 0x70, 0x61, 0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, 0x74, + 0x28, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x29, 0x29, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x46, 0x6c, + 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x28, + 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x2c, + 0x20, 0x6d, 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, + 0x73, 0x74, 0x65, 0x70, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, + 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, + 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, + 0x6d, 0x65, 0x7d, 0x22, 0x3e, 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, + 0x7d, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, + 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, + 0x61, 0x6e, 0x67, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, + 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, + 0x24, 0x7b, 0x6d, 0x69, 0x6e, 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, + 0x22, 0x24, 0x7b, 0x6d, 0x61, 0x78, 0x7d, 0x22, 0x20, 0x73, 0x74, 0x65, + 0x70, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x74, 0x65, 0x70, 0x7d, 0x22, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, + 0x7d, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, + 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, + 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x7d, + 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, + 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, + 0x3d, 0x20, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, + 0x61, 0x78, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, + 0x65, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x29, 0x20, 0x3d, + 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x3d, 0x20, - 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, - 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, - 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, - 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, - 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x3e, - 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x7d, 0x3c, 0x2f, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, - 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x22, - 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, - 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x69, 0x6e, - 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x61, - 0x78, 0x7d, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x24, 0x7b, - 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x3d, 0x22, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x22, 0x20, - 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, - 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, - 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, - 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, - 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x72, - 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, - 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, - 0x70, 0x74, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3c, 0x2f, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, + 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, + 0x22, 0x3e, 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x7d, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, - 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, - 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, - 0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, - 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x6d, - 0x70, 0x74, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x34, 0x20, - 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, - 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, - 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, - 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, - 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, - 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, 0x3e, 0x55, 0x73, 0x65, 0x72, - 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, + 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, + 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, + 0x65, 0x7d, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x24, 0x7b, 0x6d, + 0x69, 0x6e, 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x24, 0x7b, + 0x6d, 0x61, 0x78, 0x7d, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, + 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, + 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, + 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, + 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, + 0x6e, 0x3e, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3c, 0x2f, + 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, + 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, + 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, - 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, - 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, 0x20, 0x76, + 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x70, 0x72, + 0x6f, 0x6d, 0x70, 0x74, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, + 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x74, 0x79, 0x70, + 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, + 0x65, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x75, 0x73, - 0x65, 0x72, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, - 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, - 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, - 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, - 0x22, 0x62, 0x6f, 0x74, 0x22, 0x3e, 0x42, 0x6f, 0x74, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, - 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, - 0x3d, 0x22, 0x63, 0x68, 0x61, 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x63, 0x68, 0x61, 0x72, 0x7d, - 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, - 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, - 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, - 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, - 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, - 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x74, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, - 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x74, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, - 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, + 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, + 0x6f, 0x6d, 0x70, 0x74, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x43, 0x68, 0x61, 0x74, 0x20, - 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x74, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, - 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x68, 0x69, 0x73, - 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, - 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, - 0x31, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, - 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, - 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, - 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x73, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, - 0x30, 0x34, 0x38, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x2d, 0x31, - 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6e, 0x5f, 0x70, - 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, - 0x63, 0x74, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, - 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, - 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, - 0x2e, 0x35, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, - 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x65, 0x6d, - 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, 0x73, - 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, 0x65, - 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, - 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x65, 0x6e, 0x61, - 0x6c, 0x69, 0x7a, 0x65, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x20, - 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x22, 0x2c, 0x20, 0x6d, - 0x61, 0x78, 0x3a, 0x20, 0x32, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, - 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, - 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, - 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, - 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, - 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, - 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, - 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x43, 0x6f, 0x6e, 0x73, 0x69, 0x64, 0x65, - 0x72, 0x20, 0x4e, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x66, - 0x6f, 0x72, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x22, - 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x30, 0x34, 0x38, 0x2c, - 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, - 0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, - 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, - 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, - 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, - 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x6f, 0x70, 0x2d, 0x4b, 0x20, 0x73, - 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, - 0x78, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, - 0x20, 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, - 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x7d, 0x29, 0x7d, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, + 0x6f, 0x72, 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, 0x3e, 0x55, 0x73, + 0x65, 0x72, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, + 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, + 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, + 0x75, 0x73, 0x65, 0x72, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, + 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, + 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, - 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, - 0x6f, 0x70, 0x2d, 0x50, 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, - 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, - 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x6f, 0x70, 0x5f, 0x70, - 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, - 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, - 0x6f, 0x70, 0x5f, 0x70, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, - 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, + 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, + 0x72, 0x3d, 0x22, 0x62, 0x6f, 0x74, 0x22, 0x3e, 0x42, 0x6f, 0x74, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, + 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, + 0x6d, 0x65, 0x3d, 0x22, 0x63, 0x68, 0x61, 0x72, 0x22, 0x20, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, + 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x63, 0x68, 0x61, + 0x72, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, + 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, + 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, + 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, + 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, + 0x61, 0x74, 0x65, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, + 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, + 0x61, 0x72, 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, + 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, + 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, + 0x73, 0x3d, 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, + 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, + 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, + 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x43, 0x68, 0x61, + 0x74, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x74, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, + 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, + 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, + 0x61, 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x68, + 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, + 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, + 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, + 0x73, 0x3d, 0x31, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, + 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, + 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x4d, - 0x6f, 0x72, 0x65, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x3c, - 0x2f, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, - 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, - 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, - 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, - 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x46, 0x53, - 0x2d, 0x5a, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, - 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, - 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x66, 0x73, 0x5f, - 0x7a, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, - 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x74, 0x66, 0x73, 0x5f, 0x7a, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, - 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, - 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x79, 0x70, - 0x69, 0x63, 0x61, 0x6c, 0x20, 0x50, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, - 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, - 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, - 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x22, 0x2c, 0x20, - 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, 0x69, - 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, - 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, - 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x72, 0x65, - 0x73, 0x65, 0x6e, 0x63, 0x65, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, - 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, - 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x70, 0x72, 0x65, 0x73, 0x65, - 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, - 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, - 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, - 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, - 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, + 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, + 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, + 0x20, 0x32, 0x30, 0x34, 0x38, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, + 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6e, + 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x22, 0x2c, 0x20, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x65, + 0x64, 0x69, 0x63, 0x74, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x46, 0x72, 0x65, 0x71, 0x75, 0x65, - 0x6e, 0x63, 0x79, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, - 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, - 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x66, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, - 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, + 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, + 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, + 0x20, 0x31, 0x2e, 0x35, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, + 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, + 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x66, 0x72, 0x65, - 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, - 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, - 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x72, 0x20, 0x2f, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, - 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x68, 0x72, 0x65, 0x65, 0x22, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, + 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, + 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, - 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, - 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, - 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3d, 0x22, 0x30, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, - 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, - 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x7d, 0x20, 0x6f, 0x6e, 0x69, - 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, - 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, - 0x2f, 0x3e, 0x20, 0x6e, 0x6f, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, - 0x61, 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, + 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, + 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x65, + 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, + 0x74, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x22, 0x2c, + 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x2e, 0x30, 0x2c, 0x20, 0x6d, + 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, + 0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, + 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, + 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, + 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, + 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x43, 0x6f, 0x6e, 0x73, 0x69, + 0x64, 0x65, 0x72, 0x20, 0x4e, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, + 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, + 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x30, 0x34, + 0x38, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x6e, + 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, + 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x22, 0x2c, 0x20, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, + 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x7d, 0x29, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, + 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x6f, 0x70, 0x2d, 0x4b, + 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, 0x67, 0x22, 0x2c, 0x20, + 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x2c, 0x20, 0x6d, 0x69, + 0x6e, 0x3a, 0x20, 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, + 0x20, 0x22, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x22, 0x2c, 0x20, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x7d, + 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, + 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, + 0x22, 0x54, 0x6f, 0x70, 0x2d, 0x50, 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, + 0x69, 0x6e, 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, + 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, + 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x6f, 0x70, + 0x5f, 0x70, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, + 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, + 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x2e, 0x74, 0x6f, 0x70, 0x5f, 0x70, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, + 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x65, 0x74, 0x61, 0x69, + 0x6c, 0x73, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, + 0x3e, 0x4d, 0x6f, 0x72, 0x65, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, + 0x73, 0x3c, 0x2f, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, - 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, - 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, - 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x31, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, - 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, - 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x31, 0x7d, 0x20, 0x6f, 0x6e, - 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, - 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, - 0x20, 0x2f, 0x3e, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, - 0x20, 0x76, 0x31, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, + 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, + 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, + 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, + 0x46, 0x53, 0x2d, 0x5a, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, + 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, + 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x66, + 0x73, 0x5f, 0x7a, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, + 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, + 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x2e, 0x74, 0x66, 0x73, 0x5f, 0x7a, 0x7d, 0x29, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, + 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, + 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x20, 0x50, 0x22, 0x2c, 0x20, 0x6d, + 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, + 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, + 0x20, 0x22, 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x22, + 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, + 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, + 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x7d, 0x29, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, + 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, + 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x20, 0x70, 0x65, 0x6e, 0x61, + 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, + 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, + 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x70, 0x72, 0x65, + 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, + 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, + 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, + 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, + 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, + 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x46, 0x72, 0x65, 0x71, + 0x75, 0x65, 0x6e, 0x63, 0x79, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, + 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, + 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x66, 0x72, 0x65, 0x71, 0x75, + 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, + 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, + 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, + 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x66, + 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, + 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, + 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x72, 0x20, + 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, + 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x68, 0x72, 0x65, + 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x32, 0x22, 0x20, 0x63, 0x68, 0x65, + 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x30, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, - 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x32, 0x7d, 0x20, 0x6f, + 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, - 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, - 0x74, 0x20, 0x76, 0x32, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, + 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x6e, 0x6f, 0x20, 0x4d, 0x69, 0x72, 0x6f, + 0x73, 0x74, 0x61, 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, + 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, + 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, + 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x31, 0x22, 0x20, 0x63, 0x68, + 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, + 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x31, 0x7d, 0x20, + 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, + 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, + 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, + 0x61, 0x74, 0x20, 0x76, 0x31, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, + 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, + 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, + 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, + 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x32, 0x22, 0x20, 0x63, + 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, + 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x32, 0x7d, + 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, + 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, + 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, + 0x74, 0x61, 0x74, 0x20, 0x76, 0x32, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, + 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, + 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, + 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x74, 0x61, 0x75, + 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x2e, 0x30, + 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, + 0x74, 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, 0x22, 0x2c, 0x20, 0x73, 0x74, + 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, + 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, - 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x74, 0x61, 0x75, 0x22, 0x2c, - 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x2e, 0x30, 0x2c, 0x20, - 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, - 0x74, 0x5f, 0x74, 0x61, 0x75, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, - 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, - 0x5f, 0x74, 0x61, 0x75, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, - 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, 0x72, 0x6f, - 0x73, 0x74, 0x61, 0x74, 0x20, 0x65, 0x74, 0x61, 0x22, 0x2c, 0x20, 0x6d, - 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, - 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, - 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, - 0x74, 0x61, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, - 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, - 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, - 0x61, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, - 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, - 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x2f, 0x2f, 0x20, 0x70, 0x6f, 0x6f, 0x72, 0x20, 0x6d, 0x61, 0x6e, 0x73, - 0x20, 0x6d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x20, 0x72, 0x65, - 0x70, 0x6c, 0x61, 0x63, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, 0x61, 0x72, 0x6b, - 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, 0x68, 0x20, 0x3d, 0x20, 0x28, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x6d, 0x64, 0x20, 0x3d, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x74, 0x65, 0x78, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x26, - 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x61, 0x6d, 0x70, 0x3b, 0x27, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, - 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x3c, 0x2f, 0x67, 0x2c, 0x20, - 0x27, 0x26, 0x6c, 0x74, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, - 0x28, 0x2f, 0x3e, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x67, 0x74, 0x3b, + 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x65, 0x74, 0x61, 0x22, 0x2c, + 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, + 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, + 0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, + 0x5f, 0x65, 0x74, 0x61, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, + 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, + 0x65, 0x74, 0x61, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, + 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x65, 0x74, 0x61, 0x69, + 0x6c, 0x73, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x2f, 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x70, 0x6f, 0x6f, 0x72, 0x20, 0x6d, 0x61, + 0x6e, 0x73, 0x20, 0x6d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x20, + 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, 0x61, + 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, 0x68, 0x20, 0x3d, 0x20, + 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x6d, 0x64, 0x20, 0x3d, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, + 0x73, 0x2e, 0x74, 0x65, 0x78, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, + 0x2f, 0x26, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x61, 0x6d, 0x70, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, - 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x23, 0x7b, - 0x31, 0x2c, 0x36, 0x7d, 0x20, 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, - 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x68, 0x33, 0x3e, 0x24, 0x31, 0x3c, - 0x2f, 0x68, 0x33, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x3c, 0x2f, 0x67, + 0x2c, 0x20, 0x27, 0x26, 0x6c, 0x74, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, + 0x63, 0x65, 0x28, 0x2f, 0x3e, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x67, + 0x74, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, + 0x23, 0x7b, 0x31, 0x2c, 0x36, 0x7d, 0x20, 0x28, 0x2e, 0x2a, 0x29, 0x24, + 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x68, 0x33, 0x3e, 0x24, + 0x31, 0x3c, 0x2f, 0x68, 0x33, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, + 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, + 0x5c, 0x2a, 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, + 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, + 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, - 0x2f, 0x5c, 0x2a, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x2a, - 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, - 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, - 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5f, - 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x5f, 0x2f, 0x67, 0x2c, 0x20, - 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, - 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, + 0x2f, 0x5f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x5f, 0x2f, 0x67, + 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, + 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, + 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, + 0x3f, 0x29, 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, + 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, - 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, - 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, - 0x31, 0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, - 0x65, 0x28, 0x2f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x2f, 0x67, - 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x65, - 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60, - 0x60, 0x60, 0x2e, 0x2a, 0x3f, 0x5c, 0x6e, 0x28, 0x5b, 0x5c, 0x73, 0x5c, - 0x53, 0x5d, 0x2a, 0x3f, 0x29, 0x60, 0x60, 0x60, 0x2f, 0x67, 0x2c, 0x20, - 0x27, 0x3c, 0x70, 0x72, 0x65, 0x3e, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, - 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x3c, 0x2f, 0x70, - 0x72, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, - 0x60, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27, - 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, - 0x64, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, - 0x5c, 0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x62, 0x72, - 0x20, 0x2f, 0x3e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, - 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x64, 0x61, 0x6e, 0x67, 0x65, - 0x72, 0x6f, 0x75, 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, 0x6e, - 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x3d, 0x24, 0x7b, 0x7b, 0x20, 0x5f, - 0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x3a, 0x20, 0x6d, 0x64, 0x20, 0x7d, 0x7d, - 0x20, 0x2f, 0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, - 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x28, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x6c, - 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, + 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, + 0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, + 0x2f, 0x60, 0x60, 0x60, 0x2e, 0x2a, 0x3f, 0x5c, 0x6e, 0x28, 0x5b, 0x5c, + 0x73, 0x5c, 0x53, 0x5d, 0x2a, 0x3f, 0x29, 0x60, 0x60, 0x60, 0x2f, 0x67, + 0x2c, 0x20, 0x27, 0x3c, 0x70, 0x72, 0x65, 0x3e, 0x3c, 0x63, 0x6f, 0x64, + 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x3c, + 0x2f, 0x70, 0x72, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, + 0x28, 0x2f, 0x60, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x60, 0x2f, 0x67, 0x2c, + 0x20, 0x27, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, + 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, + 0x28, 0x2f, 0x5c, 0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, + 0x62, 0x72, 0x20, 0x2f, 0x3e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, - 0x6d, 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x2f, 0x3e, 0x60, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x6d, 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x64, 0x61, 0x6e, + 0x67, 0x65, 0x72, 0x6f, 0x75, 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, + 0x6e, 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x3d, 0x24, 0x7b, 0x7b, + 0x20, 0x5f, 0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x3a, 0x20, 0x6d, 0x64, 0x20, + 0x7d, 0x7d, 0x20, 0x2f, 0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, + 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, + 0x21, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x2f, 0x3e, + 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, + 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, + 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, + 0x65, 0x72, 0x5f, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x5f, 0x6d, 0x73, 0x2e, + 0x74, 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x29, 0x7d, 0x6d, 0x73, + 0x20, 0x70, 0x65, 0x72, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x2c, 0x20, + 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, + 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, 0x73, 0x65, 0x63, + 0x6f, 0x6e, 0x64, 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, + 0x32, 0x29, 0x7d, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x70, + 0x65, 0x72, 0x20, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x41, 0x70, 0x70, 0x28, 0x70, 0x72, + 0x6f, 0x70, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, - 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, - 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, - 0x5f, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x5f, 0x6d, 0x73, 0x2e, 0x74, 0x6f, - 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x29, 0x7d, 0x6d, 0x73, 0x20, 0x70, - 0x65, 0x72, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x2c, 0x20, 0x24, 0x7b, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, - 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, 0x73, 0x65, 0x63, 0x6f, 0x6e, - 0x64, 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x32, 0x29, - 0x7d, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x70, 0x65, 0x72, - 0x20, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x41, 0x70, 0x70, 0x28, 0x70, 0x72, 0x6f, 0x70, - 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, - 0x76, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, - 0x6e, 0x65, 0x72, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x68, 0x31, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, - 0x70, 0x70, 0x3c, 0x2f, 0x68, 0x31, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, - 0x65, 0x72, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x69, 0x64, 0x3d, - 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x22, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x24, 0x7b, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74, 0x65, - 0x64, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3f, 0x20, 0x43, 0x68, - 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3a, 0x20, 0x43, 0x6f, 0x6e, 0x66, - 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x6d, - 0x61, 0x69, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x20, 0x69, 0x64, 0x3d, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, 0x22, 0x3e, + 0x64, 0x69, 0x76, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, + 0x61, 0x69, 0x6e, 0x65, 0x72, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x65, 0x61, 0x64, 0x65, + 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x68, 0x31, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x2e, 0x63, 0x70, 0x70, 0x3c, 0x2f, 0x68, 0x31, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x68, 0x65, + 0x61, 0x64, 0x65, 0x72, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x69, + 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x24, 0x7b, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, - 0x6e, 0x70, 0x75, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x65, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, + 0x20, 0x3c, 0x24, 0x7b, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, + 0x74, 0x65, 0x64, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3f, 0x20, + 0x43, 0x68, 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3a, 0x20, 0x43, 0x6f, + 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x7d, 0x20, 0x2f, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x2f, 0x6d, 0x61, 0x69, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, + 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, + 0x65, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, + 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x6f, 0x74, + 0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x70, 0x3e, 0x3c, 0x24, 0x7b, 0x4d, 0x6f, + 0x64, 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x49, 0x6e, 0x66, 0x6f, 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x70, 0x3e, 0x3c, 0x24, 0x7b, 0x4d, 0x6f, 0x64, 0x65, - 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, - 0x6e, 0x66, 0x6f, 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x70, 0x3e, 0x50, 0x6f, 0x77, 0x65, 0x72, 0x65, 0x64, 0x20, 0x62, - 0x79, 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, - 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74, 0x68, 0x75, - 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x67, 0x65, 0x72, 0x67, 0x61, - 0x6e, 0x6f, 0x76, 0x2f, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, - 0x70, 0x22, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, - 0x3c, 0x2f, 0x61, 0x3e, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x3c, 0x61, 0x20, - 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, - 0x2f, 0x2f, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x22, 0x3e, 0x67, - 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x3c, 0x2f, 0x61, 0x3e, 0x2e, 0x3c, - 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, - 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x28, 0x41, 0x70, 0x70, 0x29, - 0x2c, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x62, - 0x6f, 0x64, 0x79, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x63, - 0x72, 0x69, 0x70, 0x74, 0x3e, 0x0a, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, - 0x3e, 0x0a, 0x0a, 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x3c, 0x2f, - 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x0a, 0x3c, 0x2f, 0x68, 0x74, 0x6d, - 0x6c, 0x3e, 0x0a + 0x20, 0x20, 0x3c, 0x70, 0x3e, 0x50, 0x6f, 0x77, 0x65, 0x72, 0x65, 0x64, + 0x20, 0x62, 0x79, 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, + 0x22, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74, + 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x67, 0x65, 0x72, + 0x67, 0x61, 0x6e, 0x6f, 0x76, 0x2f, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, + 0x63, 0x70, 0x70, 0x22, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, + 0x70, 0x70, 0x3c, 0x2f, 0x61, 0x3e, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x3c, + 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, + 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x22, + 0x3e, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x3c, 0x2f, 0x61, 0x3e, + 0x2e, 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, + 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x28, 0x41, 0x70, + 0x70, 0x29, 0x2c, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, + 0x2e, 0x62, 0x6f, 0x64, 0x79, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x3c, 0x2f, + 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3e, 0x0a, 0x3c, 0x2f, 0x68, 0x65, + 0x61, 0x64, 0x3e, 0x0a, 0x0a, 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, + 0x3c, 0x2f, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x0a, 0x3c, 0x2f, 0x68, + 0x74, 0x6d, 0x6c, 0x3e, 0x0a }; -unsigned int index_html_len = 13791; +unsigned int index_html_len = 13781; diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 1db69a6bab024..de41da187a40f 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -282,8 +282,9 @@ useEffect(() => { // scroll to bottom (if needed) - if (container.current && container.current.scrollHeight <= container.current.scrollTop + container.current.offsetHeight + 300) { - container.current.scrollTo(0, container.current.scrollHeight) + const parent = container.current.parentElement; + if (parent && parent.scrollHeight <= parent.scrollTop + parent.offsetHeight + 300) { + parent.scrollTo(0, parent.scrollHeight) } }, [messages]) From f514d1b306e1114c2884fcb25dd9bd48ae64ba32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 5 Aug 2023 18:20:44 +0200 Subject: [PATCH 59/70] CUDA: faster k-quant mul_mat_q kernels (#2525) --- ggml-cuda.cu | 889 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 518 insertions(+), 371 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d64d7045ccf3e..9d42efb0d0b03 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1383,8 +1383,10 @@ template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp sumi = __dp4a(vi1, u[2*i+1], sumi); } + const float2 ds8f = __half22float2(ds8); + // second part effectively subtracts 8 from each quant value - return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y)); + return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); #else return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -1410,12 +1412,14 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp } #ifdef GGML_CUDA_F16 - const half2 tmp = __hmul2(dm4, ds8); - const float d4d8 = __half2float(tmp.x); - const float m4s8 = __half2float(tmp.y); + const float2 tmp = __half22float2(__hmul2(dm4, ds8)); + const float d4d8 = tmp.x; + const float m4s8 = tmp.y; #else - const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x); - const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y); + const float2 dm4f = __half22float2(dm4); + const float2 ds8f = __half22float2(ds8); + const float d4d8 = dm4f.x * ds8f.x; + const float m4s8 = dm4f.y * ds8f.y; #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it @@ -1434,6 +1438,7 @@ template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; +#pragma unroll for (int i = 0; i < vdr; ++i) { int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 @@ -1450,8 +1455,10 @@ template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values } + const float2 ds8f = __half22float2(ds8); + // second part effectively subtracts 16 from each quant value - return d5 * (sumi*__half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y)); + return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); #else return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -1466,6 +1473,7 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; +#pragma unroll for (int i = 0; i < vdr; ++i) { int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 @@ -1483,12 +1491,14 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp } #ifdef GGML_CUDA_F16 - const half2 tmp = __hmul2(dm5, ds8); - const float d5d8 = __half2float(tmp.x); - const float m5s8 = __half2float(tmp.y); + const float2 tmp = __half22float2(__hmul2(dm5, ds8)); + const float d5d8 = tmp.x; + const float m5s8 = tmp.y; #else - const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x); - const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y); + const float2 dm5f = __half22float2(dm5); + const float2 ds8f = __half22float2(ds8); + const float d5d8 = dm5f.x * ds8f.x; + const float m5s8 = dm5f.y * ds8f.y; #endif // GGML_CUDA_F16 // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it @@ -1503,17 +1513,18 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp #define VDR_Q8_0_Q8_1_MMQ 8 template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( - const int * v, const int * u, const float & d8_0, const half2 & ds8_1) { + const int * v, const int * u, const float & d8_0, const float & d8_1) { #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; +#pragma unroll for (int i = 0; i < vdr; ++i) { // SIMD dot product of quantized values sumi = __dp4a(v[i], u[i], sumi); } - return sumi * d8_0 * __half2float(ds8_1.x); + return d8_0*d8_1 * sumi; #else return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -1525,18 +1536,21 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; +#pragma unroll for (int i = 0; i < vdr; ++i) { // SIMD dot product of quantized values sumi = __dp4a(v[i], u[i], sumi); } #ifdef GGML_CUDA_F16 - const half2 tmp = __hmul2(dm8, ds8); - const float d8d8 = __half2float(tmp.x); - const float m8s8 = __half2float(tmp.y); + const float2 tmp = __half22float2(__hmul2(dm8, ds8)); + const float d8d8 = tmp.x; + const float m8s8 = tmp.y; #else - const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x); - const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y); + const float2 dm8f = __half22float2(dm8); + const float2 ds8f = __half22float2(ds8); + const float d8d8 = dm8.x * ds8.x; + const float m8s8 = dm8.y * ds8.y; #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it @@ -1546,6 +1560,312 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } +#define VDR_Q2_K_Q8_1_MMVQ 1 +#define VDR_Q2_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( + const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const half2 & dm2, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR2_K; ++i) { + const int sc = scales[2*i]; + + const int vi = (v >> (2*i)) & 0x03030303; + + sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values + } + + const float2 dm2f = __half22float2(dm2); + + return dm2f.x*sumf_d - dm2f.y*sumf_m; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const half2 & dm2, const float & d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi_d = 0; + int sumi_m = 0; + +#pragma unroll + for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) { + int sumi_d_sc = 0; + + const int sc = scales[i0 / (QI8_1/2)]; + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + +#pragma unroll + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product + sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m + } + + sumi_d += sumi_d_sc * (sc & 0xF); + } + + const float2 dm2f = __half22float2(dm2); + + return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q3_K_Q8_1_MMVQ 1 +#define VDR_Q3_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( + const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const int & scale_offset, const float & d3, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + const int isc = scale_offset + 2*i; + + const int isc_low = isc % (QK_K/32); + const int sc_shift_low = 4 * (isc / (QK_K/32)); + const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; + + const int isc_high = isc % (QK_K/64); + const int sc_shift_high = 2 * (isc / (QK_K/64)); + const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + + const int sc = (sc_low | sc_high) - 32; + + const int vil = (vl >> (2*i)) & 0x03030303; + + const int vih = ((vh >> i) << 2) & 0x04040404; + + const int vi = __vsubss4(vil, vih); + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d3 * sumf; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales, + const float & d3, const float & d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) { + int sumi_sc = 0; + + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product + } + + sumi += sumi_sc * scales[i0 / (QI8_1/2)]; + } + + return d3*d8 * sumi; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_K_Q8_1_MMVQ 2 +#define VDR_Q4_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K; ++i) { + const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; + const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; + + const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +// also used for q5_K +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) { + int sumi_d = 0; + +#pragma unroll + for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) { + sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product + sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i0 / 4]); + + sumf_d += ds8f.x * (sc[i0/4] * sumi_d); + sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_K_Q8_1_MMVQ 2 +#define VDR_Q5_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl( + const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; + const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; + + const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; + const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; + + const int v0i = vl0i | vh0i; + const int v1i = vl1i | vh1i; + + const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); + + } + + const float2 dm5f = __half22float2(dm5); + + return dm5f.x*sumf_d - dm5f.y*sumf_m; + +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q6_K_Q8_1_MMVQ 1 +#define VDR_Q6_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( + const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales, + const float & d, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + const int sc = scales[4*i]; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (4*i)) << 4) & 0x30303030; + + const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d*sumf; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc, + const float & d6, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + +#pragma unroll + for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) { + int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale + +#pragma unroll + for (int i = i0; i < i0 + 2; ++i) { + sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product + sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product + + sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product + sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product + } + + sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y); + } + + return d6 * sumf_d; + +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + static __device__ __forceinline__ float vec_dot_q4_0_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { @@ -1631,6 +1951,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( __builtin_assume(j < WARP_SIZE); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); + __builtin_assume(k % VDR_Q4_0_Q8_1_MMQ == 0); const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const float * x_dmf = (float *) x_dm; @@ -1729,6 +2050,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( __builtin_assume(j < WARP_SIZE); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); + __builtin_assume(k % VDR_Q4_1_Q8_1_MMQ == 0); const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); @@ -1848,10 +2170,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( __builtin_assume(j < WARP_SIZE); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); + __builtin_assume(k % VDR_Q5_0_Q8_1_MMQ == 0); const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; - const float * x_dmf = (float *) x_dm; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; int u[2*VDR_Q5_0_Q8_1_MMQ]; @@ -1862,7 +2186,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( } return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); } static __device__ __forceinline__ float vec_dot_q5_1_q8_1( @@ -1965,6 +2289,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( __builtin_assume(j < WARP_SIZE); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); + __builtin_assume(k % VDR_Q5_1_Q8_1_MMQ == 0); const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; @@ -1989,12 +2314,13 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( int v[VDR_Q8_0_Q8_1_MMVQ]; int u[VDR_Q8_0_Q8_1_MMVQ]; +#pragma unroll for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { v[i] = get_int_from_int8(bq8_0->qs, iqs + i); u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); } - return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, bq8_1->ds); + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, bq8_1->ds.x); } static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { @@ -2065,43 +2391,14 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( __builtin_assume(j < WARP_SIZE); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); + __builtin_assume(k % VDR_Q8_0_Q8_1_MMQ == 0); - const float * x_dmf = (float *) x_dm; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; return vec_dot_q8_0_q8_1_impl (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], - y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]); -} - -#define VDR_q2_K_q8_1 1 - -static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl( - const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales, - const half2 & dm, const float * __restrict__ d8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - float sumf_d = 0.0f; - float sumf_m = 0.0f; - - for (int i = 0; i < QR2_K; ++i) { - const int sc = scales[2*i]; - - const int vi = (v >> (2*i)) & 0x03030303; - - sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product - - int sc_high = sc >> 4; - sc_high |= sc_high << 8; - sc_high |= sc_high << 16; - sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values - } - - const float2 dmf = __half22float2(dm); - - return dmf.x*sumf_d - dmf.y*sumf_m; -#else - return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); } static __device__ __forceinline__ float vec_dot_q2_K_q8_1( @@ -2115,15 +2412,16 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( const uint8_t * scales = bq2_K->scales + scale_offset; const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); - int u[QR2_K]; + int u[QR2_K]; float d8[QR2_K]; +#pragma unroll for (int i = 0; i < QR2_K; ++ i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); d8[i] = bq8_1[bq8_offset + i].ds.x; } - return vec_dot_q2_K_q8_1_impl(v, u, scales, bq2_K->dm, d8); + return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); } static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { @@ -2204,62 +2502,26 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( __builtin_assume(j < WARP_SIZE); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); + __builtin_assume(k % VDR_Q2_K_Q8_1_MMQ == 0); - const int kbx = k / QI2_K; - const int kqsx = k % QI2_K; - - const int bq8_offset = QR2_K * (kqsx / QI8_1); - const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2); + const int kbx = k / QI2_K; + const int ky = (k % QI2_K) * QR2_K; + const float * y_df = (const float *) y_ds; - const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16 + scale_offset; + int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; - int u[QR2_K]; - float d8[QR2_K]; + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); - for (int l = 0; l < QR2_K; ++ l) { - const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1; - u[l] = y_qs[y_qs_index]; - d8[l] = y_ds[y_qs_index / QI8_1].x; +#pragma unroll + for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { + v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; } - return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], d8); -} - -#define VDR_q3_K_q8_1 1 - -static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl( - const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales, - const int & scale_offset, const float & d, const float * __restrict__ d8) { + const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - float sumf = 0.0f; - - for (int i = 0; i < QR3_K; ++i) { - const int isc = scale_offset + 2*i; - - const int isc_low = isc % (QK_K/32); - const int sc_shift_low = 4 * (isc / (QK_K/32)); - const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; - - const int isc_high = isc % (QK_K/64); - const int sc_shift_high = 2 * (isc / (QK_K/64)); - const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; - - const int sc = (sc_low | sc_high) - 32; - - const int vil = (vl >> (2*i)) & 0x03030303; - - const int vih = ((vh >> i) << 2) & 0x04040404; - - const int vi = __vsubss4(vil, vih); - - sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product - } - - return d*sumf; -#else - return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + const int index_y = j * (QR2_K*WARP_SIZE) + QR2_K*k; + return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q3_K_q8_1( @@ -2277,15 +2539,16 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; - int u[QR3_K]; + int u[QR3_K]; float d8[QR3_K]; +#pragma unroll for (int i = 0; i < QR3_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); d8[i] = bq8_1[bq8_offset + i].ds.x; } - return vec_dot_q3_K_q8_1_impl(vl, vh, u, bq3_K->scales, scale_offset, d, d8); + return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); } static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { @@ -2330,6 +2593,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; #pragma unroll for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) { @@ -2341,7 +2605,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd].x = bxi->d; + x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d; } #pragma unroll @@ -2354,7 +2618,8 @@ template static __device__ __forceinline__ void load_tiles_q3_ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); - x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); } #pragma unroll @@ -2367,7 +2632,19 @@ template static __device__ __forceinline__ void load_tiles_q3_ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); - x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->scales, k % (QI3_K/4)); + const int ksc = k % (QI3_K/4); + + const int ksc_low = ksc % (QI3_K/8); + const int shift_low = 4 * (ksc / (QI3_K/8)); + const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; + + const int ksc_high = QI3_K/8; + const int shift_high = 2 * ksc; + const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; + + const int sc = __vsubss4(sc_low | sc_high, 0x20202020); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; } } @@ -2381,57 +2658,31 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( __builtin_assume(j < WARP_SIZE); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); + __builtin_assume(k % VDR_Q3_K_Q8_1_MMQ == 0); const int kbx = k / QI3_K; - const int kqsx = k % QI3_K; + const int ky = (k % QI3_K) * QR3_K; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; - const int bq8_offset = QR3_K * (kqsx / (QI3_K/2)); - const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2); + const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; - const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16; + int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; - // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted - const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset; - - int u[QR3_K]; - float d8[QR3_K]; - - for (int l = 0; l < QR3_K; ++ l) { - const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1; - u[l] = y_qs[y_qs_index]; - d8[l] = y_ds[y_qs_index / QI8_1].x; - } - - return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset, - x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8); -} - -#define VDR_q4_K_q8_1 2 - -static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl( - const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, - const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - float sumf_d = 0.0f; - float sumf_m = 0.0f; - - for (int i = 0; i < QR4_K; ++i) { - const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; - const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; +#pragma unroll + for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + const int shift = 2 * ((ky % 32) / 8); + const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; - const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product - const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u + const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vlh = (vh << 2) & 0x04040404; - sumf_d += d8[i] * (dot1 * sc[i]); - sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values + v[l] = __vsubss4(vll, vlh); } - return __half2float(dm4.x)*sumf_d - __half2float(dm4.y)*sumf_m; - -#else - return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + const int index_y = j * (QR3_K*WARP_SIZE) + k*QR3_K; + return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q4_K_q8_1( @@ -2478,7 +2729,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( u[2*i+1] = q8[4]; } - return vec_dot_q4_K_q8_1_impl(v, u, sc, m, bq4_K->dm, d8); + return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); #else @@ -2566,7 +2817,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ } const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) { @@ -2591,7 +2842,15 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); - x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8)); + const int * scales = (int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; } } @@ -2605,76 +2864,20 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( __builtin_assume(j < WARP_SIZE); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); + __builtin_assume(k % VDR_Q4_K_Q8_1_MMQ == 0); - const int kbx = k / QI6_K; // == 0 if QK_K == 256 - const int kqsx = k % QI6_K; // == k if QK_K == 256 - - int v[2]; - int u[2*QR4_K]; - float d8[QR4_K]; - - // kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6 - const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2)); - - v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0]; - v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4]; - - const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4]; - uint16_t aux[2]; - const int l = bq8_offset/2; - if (l < 2) { - aux[0] = scales[l+0] & 0x3f3f; - aux[1] = scales[l+2] & 0x3f3f; - } else { - aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2); - aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2); - } - const uint8_t * sc = (const uint8_t *)aux; - const uint8_t * m = sc + 2; - - for (int l = 0; l < QR4_K; ++l) { - const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2); - u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)]; - u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)]; - d8[l] = y_ds[kqsy / QI8_1].x; - } - - return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8); -} - -#define VDR_q5_K_q8_1 2 - -static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl( - const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, - const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - float sumf_d = 0.0f; - float sumf_m = 0.0f; - - for (int i = 0; i < QR5_K; ++i) { - const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; - const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; - - const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; - const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; - - const int v0i = vl0i | vh0i; - const int v1i = vl1i | vh1i; - - const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product - const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u - - sumf_d += d8[i] * (dot1 * sc[i]); - sumf_m += d8[i] * (dot2 * m[i]); + int v[QR4_K*VDR_Q4_K_Q8_1_MMQ]; +#pragma unroll + for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) { + v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F; + v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F; } - return __half2float(dm5.x)*sumf_d - __half2float(dm5.y)*sumf_m; + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); -#else - return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + const int index_y = j * (QR4_K*WARP_SIZE) + QR4_K*k; + return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q5_K_q8_1( @@ -2711,6 +2914,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( const uint8_t * sc = (const uint8_t *)aux; const uint8_t * m = sc + 2; +#pragma unroll for (int i = 0; i < QR5_K; ++i) { const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; d8[i] = bq8i->ds.x; @@ -2767,14 +2971,12 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K]; - __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4]; __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; - *x_qh = tile_x_qh; *x_sc = tile_x_sc; } @@ -2801,12 +3003,25 @@ template static __device__ __forceinline__ void load_tiles_q5_ } const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR5_K*kqsx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); + const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; + const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; + + const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; + const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; + x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; } const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) { @@ -2821,19 +3036,6 @@ template static __device__ __forceinline__ void load_tiles_q5_ x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; } -#pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) { - int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4); - - x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4)); - } - #pragma unroll for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; @@ -2844,7 +3046,15 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); - x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8)); + const int * scales = (int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; } } @@ -2858,71 +3068,13 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( __builtin_assume(j < WARP_SIZE); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); + __builtin_assume(k % VDR_Q5_K_Q8_1_MMQ == 0); - const int kbx = k / QI6_K; // == 0 if QK_K == 256 - const int kqsx = k % QI6_K; // == k if QK_K == 256 - - int vl[2]; - int vh[2]; - int u[2*QR4_K]; - float d8[QR4_K]; - - const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2)); - - vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0]; - vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4]; - - vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset; - vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset; - - const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4]; - uint16_t aux[2]; - const int l = bq8_offset/2; - if (l < 2) { - aux[0] = scales[l+0] & 0x3f3f; - aux[1] = scales[l+2] & 0x3f3f; - } else { - aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2); - aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2); - } - const uint8_t * sc = (const uint8_t *)aux; - const uint8_t * m = sc + 2; - - for (int l = 0; l < QR5_K; ++l) { - const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2); - u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)]; - u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)]; - d8[l] = y_ds[kqsy / QI8_1].x; - } - - return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8); -} - -#define VDR_q6_K_q8_1 1 - -static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl( - const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales, - const float & d, const float * __restrict__ d8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - float sumf = 0.0f; - - for (int i = 0; i < QR6_K; ++i) { - const int sc = scales[4*i]; - - const int vil = (vl >> (4*i)) & 0x0F0F0F0F; - - const int vih = ((vh >> (4*i)) << 4) & 0x30303030; - - const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 - - sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product - } + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); - return d*sumf; -#else - return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; + const int index_y = j * (QR5_K*WARP_SIZE) + QR5_K*k; + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q6_K_q8_1( @@ -2942,24 +3094,23 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( int u[QR6_K]; float d8[QR6_K]; +#pragma unroll for (int i = 0; i < QR6_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); d8[i] = bq8_1[bq8_offset + 2*i].ds.x; } - return vec_dot_q6_K_q8_1_impl(vl, vh, u, scales, bq6_K->d, d8); + return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); } static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; + __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K]; - __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2]; __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; - *x_qh = tile_x_qh; *x_sc = tile_x_sc; } @@ -2986,12 +3137,26 @@ template static __device__ __forceinline__ void load_tiles_q6_ } const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR6_K*kqsx; + + const int ql = get_int_from_uint8(bxi->ql, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); + const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; + const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->ql, kqsx); + const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; + const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); + x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); } const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + float * x_dmf = (float *) x_dm; #pragma unroll for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) { @@ -3003,20 +3168,7 @@ template static __device__ __forceinline__ void load_tiles_q6_ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd].x = bxi->d; - } - -#pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) { - int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); - - if (need_check) { - i = min(i, i_max); - } - - const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2); - - x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->qh, k % (QI6_K/2)); + x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d; } #pragma unroll @@ -3043,33 +3195,19 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( __builtin_assume(j < WARP_SIZE); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); + __builtin_assume(k % VDR_Q6_K_Q8_1_MMQ == 0); - const int kbx = k / QI6_K; // == 0 if QK_K == 256 - const int kqsx = k % QI6_K; // == k if QK_K == 256 - - const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4); - const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8); - const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)); - - const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift; - - const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8); - const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; - int u[QR6_K]; - float d8[QR6_K]; - - for (int l = 0; l < QR6_K; ++l) { - const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1; - u[l] = y_qs[kqsy]; - d8[l] = y_ds[kqsy / QI8_1].x; - } + const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); - return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, - x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8); + const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; + const int index_y = j * (QR6_K*WARP_SIZE) + QR6_K*k; + return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); } -template static __global__ void mul_mat_q( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, @@ -3130,7 +3268,16 @@ static __global__ void mul_mat_q( const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE; const int kby = tid_x % blocks_per_tile_y_col; const int col_y_eff = min(col_y_0 + ids, ncols_y-1); - tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby] = y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds; + + // if the sum is not needed it's faster to transform the scale to f32 ahead of time + const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds; + half2 * dsi_dst = &tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby]; + if (need_sum) { + *dsi_dst = *dsi_src; + } else { + float * dfi_dst = (float *) dsi_dst; + *dfi_dst = (*dsi_src).x; + } } __syncthreads(); @@ -3780,7 +3927,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -3789,7 +3936,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -3798,7 +3945,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -3807,7 +3954,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -3816,7 +3963,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -3873,10 +4020,10 @@ static void ggml_mul_mat_q4_0_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -3891,10 +4038,10 @@ static void ggml_mul_mat_q4_1_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -3909,10 +4056,10 @@ static void ggml_mul_mat_q5_0_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -3927,10 +4074,10 @@ static void ggml_mul_mat_q5_1_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -3945,10 +4092,10 @@ static void ggml_mul_mat_q8_0_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -3963,10 +4110,10 @@ static void ggml_mul_mat_q2_K_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat> + mul_mat_q, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat> + mul_mat_q, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -3981,10 +4128,10 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat> + mul_mat_q, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat> + mul_mat_q, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -3999,10 +4146,10 @@ static void ggml_mul_mat_q4_K_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat> + mul_mat_q, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat> + mul_mat_q, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4017,10 +4164,10 @@ static void ggml_mul_mat_q5_K_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat> + mul_mat_q, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat> + mul_mat_q, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4035,10 +4182,10 @@ static void ggml_mul_mat_q6_K_q8_1_cuda( const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat> + mul_mat_q, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat> + mul_mat_q, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } From 2e8265ae1764d6288aab0e2df641909072e2d58e Mon Sep 17 00:00:00 2001 From: Keiichi Tabata Date: Sun, 6 Aug 2023 15:34:05 +0900 Subject: [PATCH 60/70] convert.py : add missing abstract methods for quantized data (#2491) --- convert.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/convert.py b/convert.py index ab6a4e10e49c3..f3bf1798089cc 100644 --- a/convert.py +++ b/convert.py @@ -465,6 +465,13 @@ def to_ggml(self) -> 'GGMLQuantizedTensor': def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor': return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type) + def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': + r = self.ndarray.shape[0] // 3 + return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head)) + + def part(self, n_part: int) -> 'UnquantizedTensor': + r = self.ndarray.shape[0] // 3 + return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...]) GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor] From 86c32198954a2bc482025703d6875e11f1c2a574 Mon Sep 17 00:00:00 2001 From: DannyDaemonic Date: Sat, 5 Aug 2023 23:49:34 -0700 Subject: [PATCH 61/70] console : fix issue related to Windows 11 PowerShell console mode persistence (#2521) --- examples/console.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/console.cpp b/examples/console.cpp index 4c32f3b09c31d..8966b107f0797 100644 --- a/examples/console.cpp +++ b/examples/console.cpp @@ -80,8 +80,10 @@ namespace console { // Set console input codepage to UTF16 _setmode(_fileno(stdin), _O_WTEXT); - if (!simple_io) { - // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT) + // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT) + if (simple_io) { + dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT; + } else { dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT); } if (!SetConsoleMode(hConIn, dwMode)) { From bcfdd0e6624e1d1411fdb686376cb6f24798ff34 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 6 Aug 2023 17:47:05 +0800 Subject: [PATCH 62/70] fixed bbs -1 and allow bbs = 2048 --- gpttype_adapter.cpp | 4 ++++ koboldcpp.py | 10 ++++++---- llama.cpp | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index e3941453f6cfe..4ae51ea23546c 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -347,6 +347,10 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in debugmode = inputs.debugmode; unbanTokens = inputs.unban_tokens; blasbatchsize = inputs.blasbatchsize; + if(blasbatchsize<=0) + { + blasbatchsize = 8; + } params.memory_f16 = inputs.f16_kv; params.n_ctx = inputs.max_context_length; diff --git a/koboldcpp.py b/koboldcpp.py index 1e140b9bb8890..04cba9f5b389e 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -709,8 +709,8 @@ def show_new_gui(): (lib_failsafe, "Failsafe Mode (Old CPU)")] openblas_option, clblast_option, cublas_option, default_option, noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs) # slider data - blasbatchsize_values = ["-1", "32", "64", "128", "256", "512", "1024"] - blasbatchsize_text = ["Don't Batch BLAS","32","64","128","256","512","1024"] + blasbatchsize_values = ["-1", "32", "64", "128", "256", "512", "1024", "2048"] + blasbatchsize_text = ["Don't Batch BLAS","32","64","128","256","512","1024","2048"] contextsize_text = ["512", "1024", "2048", "3072", "4096", "6144", "8192", "12288", "16384"] runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib) or os.name == 'nt' and file_exists(opt + ".dll")] antirunopts = [opt.replace("Use ", "") for lib, opt in lib_option_pairs if not file_exists(lib) or os.name == 'nt' and not file_exists(opt + ".dll")] @@ -1306,7 +1306,7 @@ def guilaunch(): tk.Label(root, text = "(Note: KoboldCpp only works with GGML model formats!)", font = ("Arial", 9)).grid(row=1,column=0) - blasbatchopts = ["Don't Batch BLAS","BLAS = 32","BLAS = 64","BLAS = 128","BLAS = 256","BLAS = 512","BLAS = 1024"] + blasbatchopts = ["Don't Batch BLAS","BLAS = 32","BLAS = 64","BLAS = 128","BLAS = 256","BLAS = 512","BLAS = 1024","BLAS = 2048"] blaschoice = tk.StringVar() blaschoice.set("BLAS = 512") @@ -1416,6 +1416,8 @@ def onDropdownChange(event): args.blasbatchsize = 512 if selblaschoice==blasbatchopts[6]: args.blasbatchsize = 1024 + if selblaschoice==blasbatchopts[7]: + args.blasbatchsize = 2048 root = tk.Tk() root.attributes("-alpha", 0) @@ -1714,7 +1716,7 @@ def main(args): parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true') parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true') parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,3072,4096,6144,8192,12288,16384], default=2048) - parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512) + parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024,2048], default=512) parser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+') parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true') parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true') diff --git a/llama.cpp b/llama.cpp index 371616d1f66c2..2d765ad88be4e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1042,7 +1042,7 @@ static void llama_model_load_internal( void * progress_callback_user_data) { model.t_start_us = ggml_time_us(); - size_t blasbatchmul = (n_batch>512?2:1); + size_t blasbatchmul = (n_batch>512?(n_batch>1024?4:2):1); std::unique_ptr ml(new llama_model_loader(fname, use_mmap)); From e99416cdfec542c11b9becf6b775fefd7c889593 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 6 Aug 2023 17:47:59 +0800 Subject: [PATCH 63/70] blasbatchsize --- koboldcpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 04cba9f5b389e..153cfc54447e4 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -923,7 +923,7 @@ def changerunmode(a,b,c): makelabelentry(quick_tab, "Threads:" , threads_var, 8, 50) # blas batch size - makeslider(quick_tab, "BLAS Batch Size:", blasbatchsize_text, blas_size_var, 0, 6, 12, set=5) + makeslider(quick_tab, "BLAS Batch Size:", blasbatchsize_text, blas_size_var, 0, 7, 12, set=5) # quick boxes quick_boxes = {"Launch Browser": launchbrowser , "High Priority" : highpriority, "Streaming Mode":stream, "Use SmartContext":smartcontext, "Unban Tokens":unbantokens, "Disable MMAP":disablemmap,} @@ -969,7 +969,7 @@ def changerunmode(a,b,c): # blas thread specifier makelabelentry(hardware_tab, "BLAS threads:" , blas_threads_var, 11, 50) # blas batch size - makeslider(hardware_tab, "BLAS Batch Size:", blasbatchsize_text, blas_size_var, 0, 6, 12, set=5) + makeslider(hardware_tab, "BLAS Batch Size:", blasbatchsize_text, blas_size_var, 0, 7, 12, set=5) # force version makelabelentry(hardware_tab, "Force Version:" , version_var, 100, 50) From 198cc826fcb968e47d402ba4f977c849b0a2076a Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 6 Aug 2023 22:19:18 +0800 Subject: [PATCH 64/70] updated lite --- klite.embd | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/klite.embd b/klite.embd index a3bb5be8591c6..5c8e1f5d1b782 100644 --- a/klite.embd +++ b/klite.embd @@ -1,6 +1,6 @@