From ea6cdccea16a4bdeb54d0f37b391b83a08d9e407 Mon Sep 17 00:00:00 2001 From: Chenxiaotao03 Date: Mon, 15 Jan 2024 15:30:54 +0800 Subject: [PATCH 1/4] MobileVLM native implementation --- android/adb_run.sh | 53 +++ android/build_64.sh | 8 + examples/llava/MobileVLM-README.md | 131 ++++++ examples/llava/clip.cpp | 388 +++++++++++++++++- .../llava/convert-image-encoder-to-gguf.py | 6 +- ggml.c | 345 +++++++++++++++- ggml.h | 34 ++ 7 files changed, 939 insertions(+), 26 deletions(-) create mode 100755 android/adb_run.sh create mode 100755 android/build_64.sh create mode 100644 examples/llava/MobileVLM-README.md diff --git a/android/adb_run.sh b/android/adb_run.sh new file mode 100755 index 0000000000000..84ab887cd57c5 --- /dev/null +++ b/android/adb_run.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed" +projector_name="mmproj-model-f16.gguf" +llama_name="ggml-model-q4_k.gguf" +img_dir="/Users/cxt/model/llm" +img_name="demo.jpg" +prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:" +# img_name="cat.jpeg" +# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:" + +program_dir="build_64/bin" +binName="llava-cli" +n_threads=4 + + +deviceDir="/data/local/tmp" +saveDir="output" +if [ ! -d ${saveDir} ]; then + mkdir ${saveDir} +fi + + +function android_run() { + # # copy resource into device + # adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name} + # adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name} + adb push ${img_dir}/${img_name} ${deviceDir}/${img_name} + # copy program into device + adb push ${program_dir}/${binName} ${deviceDir}/${binName} + adb shell "chmod 0777 ${deviceDir}/${binName}" + + # run + adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \ + -m ${deviceDir}/${llama_name} \ + --mmproj ${deviceDir}/${projector_name} \ + -t ${n_threads} \ + --image ${deviceDir}/${img_name} \ + -p \"${prompt}\" \ + > ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt" + adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \ + -m ${deviceDir}/${llama_name} \ + --mmproj ${deviceDir}/${projector_name} \ + -t ${n_threads} \ + --image ${deviceDir}/${img_name} \ + -p \"${prompt}\" \ + >> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1" + adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir} +} + +android_run + +echo "android_run is Done!" \ No newline at end of file diff --git a/android/build_64.sh b/android/build_64.sh new file mode 100755 index 0000000000000..529fb291ed3cd --- /dev/null +++ b/android/build_64.sh @@ -0,0 +1,8 @@ +#!/bin/bash +cmake ../../ \ +-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ +-DCMAKE_BUILD_TYPE=Release \ +-DANDROID_ABI="arm64-v8a" \ +-DANDROID_PLATFORM=android-23 + +make -j4 diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md new file mode 100644 index 0000000000000..1d18865ff2dd9 --- /dev/null +++ b/examples/llava/MobileVLM-README.md @@ -0,0 +1,131 @@ +# MobileVLM + +Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants. 
+
+For more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
+
+The implementation is based on llava, and is compatible with both llava and MobileVLM. The usage is basically the same as llava.
+
+## Usage
+Build with cmake or run `make llava-cli` to build it.
+
+After building, run `./llava-cli` to see the usage. For example:
+
+```sh
+./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+    --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
+    --image path/to/an/image.jpg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
+```
+
+## Model conversion
+
+1. Clone `MobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
+
+```sh
+git clone https://huggingface.co/mtgv/MobileVLM-1.7B
+
+git clone https://huggingface.co/openai/clip-vit-large-patch14-336
+```
+
+2. Use `llava-surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:
+
+```sh
+python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
+```
+
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
+
+```sh
+python ./examples/llava/convert-image-encoder-to-gguf.py \
+    -m path/to/clip-vit-large-patch14-336 \
+    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
+    --output-dir path/to/MobileVLM-1.7B \
+    --projector-type ldp
+```
+
+4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+
+```sh
+python ./convert.py path/to/MobileVLM-1.7B
+```
+
+5. Use `quantize` to convert the LLaMA part's data type from `fp16` to `q4_k`:
+```sh
+./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+```
+
+Now both the LLaMA part and the image encoder are in the `MobileVLM-1.7B` directory.
+
+## Android compile and run
+### compile
+refer to `android/build_64.sh`
+```sh
+mkdir android/build_64
+cd android/build_64
+../build_64.sh
+```
+### run on Android
+refer to `android/adb_run.sh` and adjust the resource names and paths at the top of the script before running it.
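+
+A minimal sketch of the variables that typically need editing at the top of `adb_run.sh` is shown below; the directory paths are placeholders for illustration, not the values shipped in the script:
+
+```sh
+# converted model files produced by the "Model conversion" steps above
+model_dir="/path/to/MobileVLM-1.7B"        # directory holding the two gguf files
+projector_name="mmproj-model-f16.gguf"     # CLIP + LDP projector
+llama_name="ggml-model-q4_k.gguf"          # quantized LLaMA part
+
+# test image that will be pushed to /data/local/tmp on the device
+img_dir="/path/to/images"
+img_name="demo.jpg"
+
+# prompt passed to llava-cli via -p (same template as in the examples above)
+prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:"
+```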
+
+## Some results on Android with the `Snapdragon 888` chip
+### case 1
+**input**
+```sh
+/data/local/tmp/llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/demo.jpg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
+```
+**output**
+```sh
+encode_image_with_clip: image encoded in 21148.71 ms by CLIP ( 146.87 ms per image patch)
+ Susan Wise Bauer
+llama_print_timings: load time = 23574.72 ms
+llama_print_timings: sample time = 1.24 ms / 6 runs ( 0.21 ms per token, 4850.44 tokens per second)
+llama_print_timings: prompt eval time = 12460.15 ms / 246 tokens ( 50.65 ms per token, 19.74 tokens per second)
+llama_print_timings: eval time = 424.86 ms / 6 runs ( 70.81 ms per token, 14.12 tokens per second)
+llama_print_timings: total time = 34731.93 ms
+```
+### case 2
+**input**
+```sh
+/data/local/tmp/llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/cat.jpeg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:"
+```
+
+**output**
+```sh
+encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
+ The image depicts a cat sitting in the grass near some tall green plants.
+llama_print_timings: load time = 23257.32 ms
+llama_print_timings: sample time = 5.25 ms / 18 runs ( 0.29 ms per token, 3430.53 tokens per second)
+llama_print_timings: prompt eval time = 11900.73 ms / 232 tokens ( 51.30 ms per token, 19.49 tokens per second)
+llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 ms per token, 14.07 tokens per second)
+llama_print_timings: total time = 34570.79 ms
+```
+
+## Minor shortcomings
+The `ldp` projector outputs only 1/4 as many patches as it receives: with the default 336-pixel image and 14-pixel patches, the 24x24 = 576 ViT patches become 12x12 = 144 projector tokens. As a quick implementation, `clip_n_patches` is uniformly divided by four, so the per-patch time reported when measuring image encoding is four times larger than the real cost.
+
+## TODO
+
+- [ ] Support non-CPU backends for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
+- [ ] Optimize LDP projector performance
+
+      - Optimize the structure definition to avoid unnecessary memory rearrangements and reduce the use of `ggml_permute_cpy`
+      - Optimize the operator implementations (ARM CPU/NVIDIA GPU): e.g. depthwise conv, hardswish, hardsigmoid
+- [ ] Run MobileVLM on `Jetson Orin`
+- [ ] Support more model variants, such as `MobileVLM-3B`.
+
+
+## Contributors
+```sh
+zhangjidong05, yangyang260, huyiming03, chenxiaotao03
+```
\ No newline at end of file
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 2ae8853d3d5da..c900f5a2bd82c 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -12,6 +12,7 @@ #include #include #include +#include #include "clip.h" #include "ggml.h" @@ -67,6 +68,7 @@ static std::string format(const char * fmt, ...) { #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_PROJ_TYPE "clip.projector_type" + // // tensor name constants // @@ -89,6 +91,21 @@ static std::string format(const char * fmt, ...) 
{ #define TN_TEXT_PROJ "text_projection.weight" #define TN_VIS_PROJ "visual_projection.weight" #define TN_LLAVA_PROJ "mm.%d.%s" +#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" +#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" + + +enum projector_type { + PROJECTOR_TYPE_MLP, + PROJECTOR_TYPE_LDP, + PROJECTOR_TYPE_UNKNOWN, +}; + +static std::map PROJECTOR_TYPE_NAMES = { + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, +}; + // // utilities to get data from a gguf file @@ -129,6 +146,91 @@ static std::string get_ftype(int ftype) { return ggml_type_name(static_cast(ftype)); } +static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { + switch (type) { + case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); + case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); + case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); + case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); + case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); + case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); + case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); + case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); + case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); + case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); + case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; + default: return format("unknown type %d", type); + } +} + + +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { + std::string result; + for (size_t pos = 0; ; pos += search.length()) { + auto new_pos = s.find(search, pos); + if (new_pos == std::string::npos) { + result += s.substr(pos, s.size() - pos); + break; + } + result += s.substr(pos, new_pos - pos) + replace; + pos = new_pos; + } + s = std::move(result); +} + +static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { + const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + + switch (type) { + case GGUF_TYPE_STRING: + return gguf_get_val_str(ctx_gguf, i); + case GGUF_TYPE_ARRAY: + { + const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); + int arr_n = gguf_get_arr_n(ctx_gguf, i); + const void * data = gguf_get_arr_data(ctx_gguf, i); + std::stringstream ss; + ss << "["; + for (int j = 0; j < arr_n; j++) { + if (arr_type == GGUF_TYPE_STRING) { + std::string val = gguf_get_arr_str(ctx_gguf, i, j); + // escape quotes + replace_all(val, "\\", "\\\\"); + replace_all(val, "\"", "\\\""); + ss << '"' << val << '"'; + } else if (arr_type == GGUF_TYPE_ARRAY) { + ss << "???"; + } else { + ss << gguf_data_to_str(arr_type, data, j); + } + if (j < arr_n - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + } + default: + return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); + } +} + +static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") { + size_t tensor_size = ggml_nbytes(tensor); + printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%d, %d, %d, %d], type: %d\n", + prefix, ggml_n_dims(tensor), tensor->name, tensor_size, + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->type); +} + +static projector_type clip_projector_type_from_string(const std::string & name) { + for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT + if (kv.second == name) { + return kv.first; + } + } 
+ return PROJECTOR_TYPE_UNKNOWN; +} + // // image data // @@ -205,6 +307,32 @@ struct clip_vision_model { struct ggml_tensor * mm_0_b; struct ggml_tensor * mm_2_w; struct ggml_tensor * mm_2_b; + + // MobileVLM projection + struct ggml_tensor * mm_model_mlp_1_w; + struct ggml_tensor * mm_model_mlp_1_b; + struct ggml_tensor * mm_model_mlp_3_w; + struct ggml_tensor * mm_model_mlp_3_b; + struct ggml_tensor * mm_model_block_1_block_0_0_w; + struct ggml_tensor * mm_model_block_1_block_0_1_w; + struct ggml_tensor * mm_model_block_1_block_0_1_b; + struct ggml_tensor * mm_model_block_1_block_1_fc1_w; + struct ggml_tensor * mm_model_block_1_block_1_fc1_b; + struct ggml_tensor * mm_model_block_1_block_1_fc2_w; + struct ggml_tensor * mm_model_block_1_block_1_fc2_b; + struct ggml_tensor * mm_model_block_1_block_2_0_w; + struct ggml_tensor * mm_model_block_1_block_2_1_w; + struct ggml_tensor * mm_model_block_1_block_2_1_b; + struct ggml_tensor * mm_model_block_2_block_0_0_w; + struct ggml_tensor * mm_model_block_2_block_0_1_w; + struct ggml_tensor * mm_model_block_2_block_0_1_b; + struct ggml_tensor * mm_model_block_2_block_1_fc1_w; + struct ggml_tensor * mm_model_block_2_block_1_fc1_b; + struct ggml_tensor * mm_model_block_2_block_1_fc2_w; + struct ggml_tensor * mm_model_block_2_block_1_fc2_b; + struct ggml_tensor * mm_model_block_2_block_2_0_w; + struct ggml_tensor * mm_model_block_2_block_2_1_w; + struct ggml_tensor * mm_model_block_2_block_2_1_b; }; struct clip_ctx { @@ -213,6 +341,7 @@ struct clip_ctx { bool has_llava_projector = false; struct clip_vision_model vision_model; + projector_type proj_type = PROJECTOR_TYPE_MLP; float image_mean[3]; float image_std[3]; @@ -430,16 +559,132 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 free(patches_data); } + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] embeddings = ggml_get_rows(ctx0, embeddings, patches); - // mm projection 0 - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // print_tensor_info(embeddings, "embeddings"); + + // llava projector + if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + embeddings = ggml_gelu(ctx0, embeddings); - embeddings = ggml_gelu(ctx0, embeddings); + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } + else if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projector + int n_patch = 24; + struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); + mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); + mlp_1 = ggml_gelu(ctx0, mlp_1); + struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); + mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); + // transpose from [1, 576, 2048] --> [1, 24, 24, 2048] --> [1, 2048, 24, 24] + mlp_3 = ggml_reshape_4d(ctx0, mlp_3, mlp_3->ne[0], n_patch, n_patch, mlp_3->ne[3]); + // permute logic is src idxs 0,1,2,3 perm to dst idxs + mlp_3 = ggml_permute_cpy(ctx0, mlp_3, 2, 0, 1, 3); + // mlp_3 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + + // block 1 + struct ggml_tensor * block_1 = nullptr; + { + // stride = 1, padding = 1, bias is nullptr + block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1); + + // layer norm + // // block_1 shape = [1, 2048, 24, 24], ne = 
[24, 24, 2048, 1] + block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); + block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // hardswish + struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_1_block_2_0_w, 1, 1, + model.mm_model_block_1_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]); + block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1); + + // layernorm + block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); + block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // residual + block_1 = ggml_add(ctx0, mlp_3, block_1); + } - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + // block_2 + { + // stride = 2 + block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, nullptr, 2, 2, 1, 1, 1, 1); + + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // layer norm + block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); + block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // hardswish + struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + // not sure the parameters is right for globalAvgPooling + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, 
model.mm_model_block_2_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_2_block_2_0_w, 1, 1, + model.mm_model_block_2_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]); + block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1); + // layernorm + block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); + // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] + } + embeddings = block_1; + } + else { + GGML_ASSERT(false); + } } // build the graph @@ -485,16 +730,55 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { printf("\n"); } const int n_tensors = gguf_get_n_tensors(ctx); + // kv - if (verbosity >= 3) { - const int n_kv = gguf_get_n_kv(ctx); + const int n_kv = gguf_get_n_kv(ctx); + printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", + __func__, n_kv, n_tensors, fname); + { + std::map n_type; + + uint32_t n_type_max = 0; + enum ggml_type type_max = GGML_TYPE_F32; - for (int i = 0; i < n_kv; ++i) { - const char * key = gguf_get_key(ctx, i); + for (int i = 0; i < n_tensors; i++) { + enum ggml_type type = gguf_get_tensor_type(ctx, i); - printf("%s: kv[%d]: key = %s\n", __func__, i, key); + n_type[type]++; + + if (n_type_max < n_type[type]) { + n_type_max = n_type[type]; + type_max = type; + } + } + + printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + for (int i = 0; i < n_kv; i++) { + const char * name = gguf_get_key(ctx, i); + const enum gguf_type type = gguf_get_kv_type(ctx, i); + const std::string type_name = + type == GGUF_TYPE_ARRAY + ? 
format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i)) + : gguf_type_name(type); + + std::string value = gguf_kv_to_str(ctx, i); + const size_t MAX_VALUE_LEN = 40; + if (value.size() > MAX_VALUE_LEN) { + value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); + } + replace_all(value, "\n", "\\n"); + + printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + } + + // print type counts + for (auto & kv : n_type) { + if (kv.second == 0) { + continue; + } + + printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); } - printf("\n"); } // data @@ -503,20 +787,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); + enum ggml_type type = gguf_get_tensor_type(ctx, i); struct ggml_tensor * cur = ggml_get_tensor(meta, name); size_t tensor_size = ggml_nbytes(cur); buffer_size += tensor_size; if (verbosity >= 3) { - printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i, - ggml_n_dims(cur), cur->name, tensor_size, offset); + printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%d, %d, %d, %d], type: %d\n", __func__, i, + ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], type); } } } + + buffer_size += n_tensors * 128 /* CLIP PADDING */; clip_ctx * new_clip = new clip_ctx; + // update projector type + { + int idx = gguf_find_key(ctx, KEY_PROJ_TYPE); + if (idx != -1) { + const std::string proj_type = gguf_get_val_str(ctx, idx); + new_clip->proj_type = clip_projector_type_from_string(proj_type); + } + else { + new_clip->proj_type = PROJECTOR_TYPE_MLP; + } + } + #ifdef GGML_USE_CUBLAS new_clip->backend = ggml_backend_cuda_init(0); printf("%s: CLIP using CUDA backend\n", __func__); @@ -661,10 +960,45 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); - vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); - vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); - vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); - vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); + + // LLaVA projection + if (new_clip->proj_type == PROJECTOR_TYPE_MLP) { + vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); + vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); + vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); + } + else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projection + vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); + vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); + vision_model.mm_model_mlp_3_w = 
get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); + vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); + vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); + vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); + vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); + vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); + vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); + vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); + vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); + vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); + vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); + vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + } + else { + std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); + } vision_model.layers.resize(hparams.n_layer); for (int il = 0; il < hparams.n_layer; ++il) { @@ -1100,13 +1434,25 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i } int clip_n_mmproj_embd(const struct clip_ctx * ctx) { - return ctx->vision_model.mm_2_b->ne[0]; + if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + } + else if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + return ctx->vision_model.mm_2_b->ne[0]; + } + else { + std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, 
proj_type.c_str())); + } } int clip_n_patches(const struct clip_ctx * ctx) { auto & params = ctx->vision_model.hparams; - - return (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + n_patches /= 4; + } + return n_patches; } size_t clip_embd_nbytes(const struct clip_ctx * ctx) { diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 03688e0ea1889..f5a3c9b46f9e3 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -81,6 +81,7 @@ def bytes_to_unicode(): ap.add_argument("--clip_model_is_vision", action="store_true", required=False, help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) @@ -174,6 +175,8 @@ def bytes_to_unicode(): fout.add_description("vision-only CLIP model") elif has_llava_projector: fout.add_description("image encoder for LLaVA") + # add projector type + fout.add_string("clip.projector_type", args.projector_type) else: fout.add_description("two-tower CLIP model") @@ -218,7 +221,8 @@ def bytes_to_unicode(): projector = torch.load(args.llava_projector) for name, data in projector.items(): name = get_tensor_name(name) - if data.ndim == 2: + # pw and dw conv ndim==4 + if data.ndim == 2 or data.ndim == 4: data = data.squeeze().numpy().astype(np.float16) else: data = data.squeeze().numpy().astype(np.float32) diff --git a/ggml.c b/ggml.c index ef5888ab21538..3befb4efe908c 100644 --- a/ggml.c +++ b/ggml.c @@ -1424,6 +1424,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? 
x[i] : 0.f); } +// TODO: optimize performance +inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } +inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } static const float GELU_COEF_A = 0.044715f; static const float GELU_QUICK_COEF = -1.702f; @@ -1647,6 +1650,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CLAMP", "CONV_TRANSPOSE_1D", "IM2COL", + "CONV_DEPTHWISE_2D", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", @@ -1680,7 +1684,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1734,6 +1738,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "conv_transpose_1d(x)", "im2col(x)", "conv_transpose_2d(x)", + "conv_depthwise_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", @@ -1766,7 +1771,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -1782,9 +1787,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = { "GELU", "GELU_QUICK", "SILU", + "HARDSWISH", + "HARDSIGMOID", }; -static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10"); +static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); @@ -3951,6 +3958,20 @@ struct ggml_tensor * ggml_silu_back( return result; } +// ggml hardswish +struct ggml_tensor * ggml_hardswish( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH); +} + +// ggml hardsigmoid +struct ggml_tensor * ggml_hardsigmoid( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID); +} + // ggml_norm static struct ggml_tensor * ggml_norm_impl( @@ -4759,6 +4780,24 @@ struct ggml_tensor * ggml_permute( return result; } +// some operations don't support permuted tensor, so we need to copy it, to avoid this case +struct ggml_tensor * ggml_permute_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3) { + struct ggml_tensor * result = ggml_permute(ctx, a, axis0, axis1, axis2, axis3); + // new 4d tensor + struct ggml_tensor* tensor = ggml_new_tensor_4d(ctx, a->type, result->ne[0], result->ne[1], result->ne[2], result->ne[3]); + + struct ggml_tensor* cpy = ggml_cpy(ctx, result, tensor); + + return cpy; +} + + // ggml_transpose struct ggml_tensor * ggml_transpose( @@ -5350,6 +5389,52 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( return result; } +// ggml_conv_depthwise +struct ggml_tensor * ggml_conv_depthwise_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + + + const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); + const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const 
int64_t ne[4] = { + OW, + OH, + b->ne[2], + b->ne[3], + }; + // GGML_ASSERT(a->ne[3] == b->ne[2]); + // GGML_ASSERT(a->ne[2] == 1); + + // weight ne: [KW, KH, OC, 1] + GGML_ASSERT(a->ne[2] == b->ne[2]); + GGML_ASSERT(a->ne[3] == 1); + bool is_node = false; + /* + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + */ + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + int32_t params[] = { s0, s1, p0, p1, d0, d1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_DEPTHWISE_2D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + return result; +} // ggml_conv_2d // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] @@ -9339,6 +9424,87 @@ static void ggml_compute_forward_silu_back( } } + +static void ggml_compute_forward_hardswish_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_hardswish_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} +static void ggml_compute_forward_hardswish( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_hardswish_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_hardsigmoid_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_hardsigmoid_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_hardsigmoid( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_hardsigmoid_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + + // ggml_compute_forward_norm static void ggml_compute_forward_norm_f32( @@ -12363,6 +12529,160 @@ static void ggml_compute_forward_im2col( } } +static void ggml_compute_forward_conv_depthwise_2d_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + // total patches in dst + const int np = ne2; + + // patches per thread + const int dp = (np + 
nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + const int32_t stride_h = ggml_get_op_params_i32(dst, 0); + const int32_t stride_w = ggml_get_op_params_i32(dst, 1); + const int32_t pad_h = ggml_get_op_params_i32(dst, 2); + const int32_t pad_w = ggml_get_op_params_i32(dst, 3); + const int32_t dilation_h = ggml_get_op_params_i32(dst, 4); + const int32_t dilation_w = ggml_get_op_params_i32(dst, 5); + + float* weight = (float*)(src0->data); + float* input = (float*)(src1->data); + // float* bias = (float*)(src2->data); + float* output = (float*)(dst->data); + for (int b = 0; b < ne13; ++b) { + for (int o_c = ip0; o_c < ip1; ++o_c) { + for (int o_h = 0; o_h < ne1; ++o_h) { + for (int o_w = 0; o_w < ne0; ++o_w) { + float result_data = 0; + int g = o_c; + int i_c = g; + for (int k_h = 0; k_h < ne01; ++k_h) { + for (int k_w = 0; k_w < ne00; ++k_w) { + int i_h = o_h * stride_h - pad_h + k_h * dilation_h; + int i_w = o_w * stride_w - pad_w + k_w * dilation_w; + if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) { + continue; + } + float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w]; + float weight_data = weight[(g * ne01 + k_h) * ne00 + k_w]; + result_data += input_data * weight_data; + } + } + // output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c]; + output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data; + } + } + } + } + +} + +static void ggml_compute_forward_conv_depthwise_2d_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + // total patches in dst + const int np = ne2; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + const int32_t stride_h = ggml_get_op_params_i32(dst, 0); + const int32_t stride_w = ggml_get_op_params_i32(dst, 1); + const int32_t pad_h = ggml_get_op_params_i32(dst, 2); + const int32_t pad_w = ggml_get_op_params_i32(dst, 3); + const int32_t dilation_h = ggml_get_op_params_i32(dst, 4); + const int32_t dilation_w = ggml_get_op_params_i32(dst, 5); + + ggml_fp16_t* weight = (ggml_fp16_t*)(src0->data); + float* input = (float*)(src1->data); + // float* bias = (float*)(src2->data); + float* output = (float*)(dst->data); + for (int b = 0; b < ne13; ++b) { + for (int o_c = ip0; o_c < ip1; ++o_c) { + for (int o_h = 0; o_h < ne1; ++o_h) { + for (int o_w = 0; o_w < ne0; ++o_w) { + float result_data = 0; + int g = o_c; + int i_c = g; + for (int k_h = 0; k_h < ne01; ++k_h) { + for (int k_w = 0; k_w < ne00; ++k_w) { + int i_h = o_h * stride_h - pad_h + k_h * dilation_h; + int i_w = o_w * stride_w - pad_w + k_w * dilation_w; + if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) { + continue; + } + float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w]; + float weight_data = GGML_FP16_TO_FP32(weight[(g * ne01 + k_h) * ne00 + k_w]); + result_data += input_data * weight_data; + } + } + // output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c]; + output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data; + } + } + } + } + +} + +static void 
ggml_compute_forward_conv_depthwise_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_depthwise_2d_f32(params, src0, src1, src2, dst); + } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_conv_depthwise_2d_f16_f32(params, src0, src1, src2, dst); + } else { + GGML_ASSERT(false); + } + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_conv_transpose_2d static void ggml_compute_forward_conv_transpose_2d( @@ -13931,6 +14251,14 @@ static void ggml_compute_forward_unary( { ggml_compute_forward_silu(params, src0, dst); } break; + case GGML_UNARY_OP_HARDSWISH: + { + ggml_compute_forward_hardswish(params, src0, dst); + } break; + case GGML_UNARY_OP_HARDSIGMOID: + { + ggml_compute_forward_hardsigmoid(params, src0, dst); + } break; default: { GGML_ASSERT(false); @@ -14696,6 +15024,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); } break; + case GGML_OP_CONV_DEPTHWISE_2D: + { + ggml_compute_forward_conv_depthwise_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -16344,6 +16676,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads + case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads { n_tasks = 1; } break; @@ -16430,6 +16764,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = n_threads; } break; + case GGML_OP_CONV_DEPTHWISE_2D: + { + n_tasks = n_threads; + } break; case GGML_OP_CONV_TRANSPOSE_2D: { n_tasks = n_threads; @@ -16576,7 +16914,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // distribute new work or execute it direct if 1T while (++node_n < cgraph->n_nodes) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); - struct ggml_tensor * node = cgraph->nodes[node_n]; const int n_tasks = ggml_get_n_tasks(node, n_threads); diff --git a/ggml.h b/ggml.h index 1187074f7f174..1ca68c2f44de3 100644 --- a/ggml.h +++ b/ggml.h @@ -433,6 +433,7 @@ extern "C" { GGML_OP_CLAMP, GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, + GGML_OP_CONV_DEPTHWISE_2D, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, @@ -479,6 +480,8 @@ extern "C" { GGML_UNARY_OP_GELU, GGML_UNARY_OP_GELU_QUICK, GGML_UNARY_OP_SILU, + GGML_UNARY_OP_HARDSWISH, + GGML_UNARY_OP_HARDSIGMOID, GGML_UNARY_OP_COUNT, }; @@ -1022,6 +1025,16 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // hardswish(x) = x * relu6(x + 3) / 6 + GGML_API struct ggml_tensor * ggml_hardswish( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // hardsigmoid(x) = relu6(x + 3) / 6 + GGML_API struct ggml_tensor * ggml_hardsigmoid( + struct ggml_context * ctx, + struct ggml_tensor * a); + // normalize along rows GGML_API struct ggml_tensor * ggml_norm( struct ggml_context * ctx, @@ -1284,6 +1297,15 @@ extern "C" { int axis2, int axis3); + // some operations don't support permuted tensor, so we need to copy it, to avoid this case + GGML_API 
struct ggml_tensor * ggml_permute_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3); + // alias for ggml_permute(ctx, a, 1, 0, 2, 3) GGML_API struct ggml_tensor * ggml_transpose( struct ggml_context * ctx, @@ -1473,6 +1495,18 @@ extern "C" { int d1, bool is_2D); + GGML_API struct ggml_tensor * ggml_conv_depthwise_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); + GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, From 9303bbf1b15baf30d857e4aff1aa84de152a549a Mon Sep 17 00:00:00 2001 From: Chenxiaotao03 Date: Fri, 19 Jan 2024 12:50:01 +0800 Subject: [PATCH 2/4] delete depthwise_conv_2d and permute_cpy relative code, replace the two by the existed functions, and opt ldp definition, support LLAMA_PERF option for CMake --- CMakeLists.txt | 7 ++ android/build_64.sh | 2 +- examples/llava/clip.cpp | 51 ++++----- ggml.c | 222 ++-------------------------------------- ggml.h | 10 -- 5 files changed, 46 insertions(+), 246 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2741568ed3430..390bccc231c78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,6 +107,13 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STA option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ON) + +# add perf arguments +option(LLAMA_PERF "llama: enable perf" OFF) +if (LLAMA_PERF) + add_definitions(-DGGML_PERF) +endif() + # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) diff --git a/android/build_64.sh b/android/build_64.sh index 529fb291ed3cd..3982854e2c3f6 100755 --- a/android/build_64.sh +++ b/android/build_64.sh @@ -3,6 +3,6 @@ cmake ../../ \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DCMAKE_BUILD_TYPE=Release \ -DANDROID_ABI="arm64-v8a" \ --DANDROID_PLATFORM=android-23 +-DANDROID_PLATFORM=android-23 $1 make -j4 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index c900f5a2bd82c..34f36b4a0836a 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -583,25 +583,24 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 mlp_1 = ggml_gelu(ctx0, mlp_1); struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); - // transpose from [1, 576, 2048] --> [1, 24, 24, 2048] --> [1, 2048, 24, 24] - mlp_3 = ggml_reshape_4d(ctx0, mlp_3, mlp_3->ne[0], n_patch, n_patch, mlp_3->ne[3]); - // permute logic is src idxs 0,1,2,3 perm to dst idxs - mlp_3 = ggml_permute_cpy(ctx0, mlp_3, 2, 0, 1, 3); - // mlp_3 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] // block 1 struct ggml_tensor * block_1 = nullptr; { + // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] + mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); + mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); // stride = 1, padding = 1, bias is nullptr block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1); // layer norm // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + block_1 = ggml_cont(ctx0, 
ggml_permute(ctx0, block_1, 1, 2, 0, 3)); // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); - block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] // hardswish @@ -621,17 +620,18 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); block_1 = ggml_mul(ctx0, block_1_hw, block_1); - // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_1_block_2_0_w, 1, 1, - model.mm_model_block_1_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]); - block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1); + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); - // layernorm - block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); - block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] // residual block_1 = ggml_add(ctx0, mlp_3, block_1); @@ -644,11 +644,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // layer norm - block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); - block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // hardswish struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); @@ -664,22 +664,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); block_1 = ggml_hardsigmoid(ctx0, block_1); - + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); block_1 = ggml_mul(ctx0, block_1_hw, block_1); - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_2_block_2_0_w, 1, 1, - model.mm_model_block_2_block_2_0_w->ne[0], 
model.mm_model_block_1_block_2_0_w->ne[1]); - block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1); - // layernorm - block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] - } + } embeddings = block_1; } else { diff --git a/ggml.c b/ggml.c index 3befb4efe908c..2e3849210778f 100644 --- a/ggml.c +++ b/ggml.c @@ -1650,7 +1650,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CLAMP", "CONV_TRANSPOSE_1D", "IM2COL", - "CONV_DEPTHWISE_2D", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", @@ -1684,7 +1683,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); +static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1738,7 +1737,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "conv_transpose_1d(x)", "im2col(x)", "conv_transpose_2d(x)", - "conv_depthwise_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", @@ -1771,7 +1769,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); +static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4780,24 +4778,6 @@ struct ggml_tensor * ggml_permute( return result; } -// some operations don't support permuted tensor, so we need to copy it, to avoid this case -struct ggml_tensor * ggml_permute_cpy( - struct ggml_context * ctx, - struct ggml_tensor * a, - int axis0, - int axis1, - int axis2, - int axis3) { - struct ggml_tensor * result = ggml_permute(ctx, a, axis0, axis1, axis2, axis3); - // new 4d tensor - struct ggml_tensor* tensor = ggml_new_tensor_4d(ctx, a->type, result->ne[0], result->ne[1], result->ne[2], result->ne[3]); - - struct ggml_tensor* cpy = ggml_cpy(ctx, result, tensor); - - return cpy; -} - - // ggml_transpose struct ggml_tensor * ggml_transpose( @@ -5402,37 +5382,18 @@ struct ggml_tensor * ggml_conv_depthwise_2d( int d0, int d1) { + struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); + struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, + ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), + s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW] - const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); - const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); - const int64_t ne[4] = { - OW, - OH, - b->ne[2], - b->ne[3], - }; - // GGML_ASSERT(a->ne[3] == b->ne[2]); - // GGML_ASSERT(a->ne[2] == 1); + struct ggml_tensor * result = + ggml_mul_mat(ctx, + ggml_reshape_4d(ctx, new_a, 
(new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC,1, KH, KW] => [1, OC, 1, KH * KW] + ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] - // weight ne: [KW, KH, OC, 1] - GGML_ASSERT(a->ne[2] == b->ne[2]); - GGML_ASSERT(a->ne[3] == 1); - bool is_node = false; - /* - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - */ - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - int32_t params[] = { s0, s1, p0, p1, d0, d1 }; - ggml_set_op_params(result, params, sizeof(params)); + result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW] - result->op = GGML_OP_CONV_DEPTHWISE_2D; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - result->src[2] = c; return result; } // ggml_conv_2d @@ -12529,159 +12490,6 @@ static void ggml_compute_forward_im2col( } } -static void ggml_compute_forward_conv_depthwise_2d_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - // total patches in dst - const int np = ne2; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); - - const int32_t stride_h = ggml_get_op_params_i32(dst, 0); - const int32_t stride_w = ggml_get_op_params_i32(dst, 1); - const int32_t pad_h = ggml_get_op_params_i32(dst, 2); - const int32_t pad_w = ggml_get_op_params_i32(dst, 3); - const int32_t dilation_h = ggml_get_op_params_i32(dst, 4); - const int32_t dilation_w = ggml_get_op_params_i32(dst, 5); - - float* weight = (float*)(src0->data); - float* input = (float*)(src1->data); - // float* bias = (float*)(src2->data); - float* output = (float*)(dst->data); - for (int b = 0; b < ne13; ++b) { - for (int o_c = ip0; o_c < ip1; ++o_c) { - for (int o_h = 0; o_h < ne1; ++o_h) { - for (int o_w = 0; o_w < ne0; ++o_w) { - float result_data = 0; - int g = o_c; - int i_c = g; - for (int k_h = 0; k_h < ne01; ++k_h) { - for (int k_w = 0; k_w < ne00; ++k_w) { - int i_h = o_h * stride_h - pad_h + k_h * dilation_h; - int i_w = o_w * stride_w - pad_w + k_w * dilation_w; - if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) { - continue; - } - float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w]; - float weight_data = weight[(g * ne01 + k_h) * ne00 + k_w]; - result_data += input_data * weight_data; - } - } - // output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c]; - output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data; - } - } - } - } - -} - -static void ggml_compute_forward_conv_depthwise_2d_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = 
params->nth; - - // total patches in dst - const int np = ne2; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); - - const int32_t stride_h = ggml_get_op_params_i32(dst, 0); - const int32_t stride_w = ggml_get_op_params_i32(dst, 1); - const int32_t pad_h = ggml_get_op_params_i32(dst, 2); - const int32_t pad_w = ggml_get_op_params_i32(dst, 3); - const int32_t dilation_h = ggml_get_op_params_i32(dst, 4); - const int32_t dilation_w = ggml_get_op_params_i32(dst, 5); - - ggml_fp16_t* weight = (ggml_fp16_t*)(src0->data); - float* input = (float*)(src1->data); - // float* bias = (float*)(src2->data); - float* output = (float*)(dst->data); - for (int b = 0; b < ne13; ++b) { - for (int o_c = ip0; o_c < ip1; ++o_c) { - for (int o_h = 0; o_h < ne1; ++o_h) { - for (int o_w = 0; o_w < ne0; ++o_w) { - float result_data = 0; - int g = o_c; - int i_c = g; - for (int k_h = 0; k_h < ne01; ++k_h) { - for (int k_w = 0; k_w < ne00; ++k_w) { - int i_h = o_h * stride_h - pad_h + k_h * dilation_h; - int i_w = o_w * stride_w - pad_w + k_w * dilation_w; - if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) { - continue; - } - float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w]; - float weight_data = GGML_FP16_TO_FP32(weight[(g * ne01 + k_h) * ne00 + k_w]); - result_data += input_data * weight_data; - } - } - // output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c]; - output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data; - } - } - } - } - -} - -static void ggml_compute_forward_conv_depthwise_2d( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_conv_depthwise_2d_f32(params, src0, src1, src2, dst); - } break; - case GGML_TYPE_F16: - { - if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_conv_depthwise_2d_f16_f32(params, src0, src1, src2, dst); - } else { - GGML_ASSERT(false); - } - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} // ggml_compute_forward_conv_transpose_2d @@ -15024,10 +14832,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); } break; - case GGML_OP_CONV_DEPTHWISE_2D: - { - ggml_compute_forward_conv_depthwise_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); - } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -16764,10 +16568,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = n_threads; } break; - case GGML_OP_CONV_DEPTHWISE_2D: - { - n_tasks = n_threads; - } break; case GGML_OP_CONV_TRANSPOSE_2D: { n_tasks = n_threads; diff --git a/ggml.h b/ggml.h index 1ca68c2f44de3..50e3882c8bb6d 100644 --- a/ggml.h +++ b/ggml.h @@ -433,7 +433,6 @@ extern "C" { GGML_OP_CLAMP, GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, - GGML_OP_CONV_DEPTHWISE_2D, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, @@ -1297,15 +1296,6 @@ extern "C" { int axis2, int axis3); - // some operations don't support permuted tensor, so we need to copy it, to avoid this case - GGML_API struct ggml_tensor * ggml_permute_cpy( - struct ggml_context * ctx, - struct ggml_tensor * a, - int axis0, - int axis1, 
- int axis2, - int axis3); - // alias for ggml_permute(ctx, a, 1, 0, 2, 3) GGML_API struct ggml_tensor * ggml_transpose( struct ggml_context * ctx, From c37859bf21ec3cafed63d4f5b83e3defbee30047 Mon Sep 17 00:00:00 2001 From: Chenxiaotao03 Date: Sun, 21 Jan 2024 15:47:18 +0800 Subject: [PATCH 3/4] move android script to example/llava directory --- examples/llava/MobileVLM-README.md | 6 +++--- {android => examples/llava/android}/adb_run.sh | 0 {android => examples/llava/android}/build_64.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename {android => examples/llava/android}/adb_run.sh (100%) rename {android => examples/llava/android}/build_64.sh (89%) diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 1d18865ff2dd9..7965602b76565 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -59,10 +59,10 @@ Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directo ## Android compile and run ### compile -refer to `android/build_64.sh` +refer to `examples/llava/android/build_64.sh` ```sh -mkdir android/build_64 -cd android/build_64 +mkdir examples/llava/android/build_64 +cd examples/llava/android/build_64 ../build_64.sh ``` ### run on Android diff --git a/android/adb_run.sh b/examples/llava/android/adb_run.sh similarity index 100% rename from android/adb_run.sh rename to examples/llava/android/adb_run.sh diff --git a/android/build_64.sh b/examples/llava/android/build_64.sh similarity index 89% rename from android/build_64.sh rename to examples/llava/android/build_64.sh index 3982854e2c3f6..71b6fd3f719cd 100755 --- a/android/build_64.sh +++ b/examples/llava/android/build_64.sh @@ -1,5 +1,5 @@ #!/bin/bash -cmake ../../ \ +cmake ../../../../ \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DCMAKE_BUILD_TYPE=Release \ -DANDROID_ABI="arm64-v8a" \ From 0e57eb875e702db006ec1a09d5b04d41a0fd066f Mon Sep 17 00:00:00 2001 From: Chenxiaotao03 Date: Mon, 22 Jan 2024 19:51:14 +0800 Subject: [PATCH 4/4] Fix the editor config checks --- examples/llava/MobileVLM-README.md | 2 +- examples/llava/android/adb_run.sh | 4 ++-- examples/llava/clip.cpp | 18 +++++++++--------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 7965602b76565..c6258eba69a53 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -128,4 +128,4 @@ The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quic ## contributor ```sh zhangjidong05, yangyang260, huyiming03, chenxiaotao03 -``` \ No newline at end of file +``` diff --git a/examples/llava/android/adb_run.sh b/examples/llava/android/adb_run.sh index 84ab887cd57c5..f73623ae3b129 100755 --- a/examples/llava/android/adb_run.sh +++ b/examples/llava/android/adb_run.sh @@ -29,7 +29,7 @@ function android_run() { # copy program into device adb push ${program_dir}/${binName} ${deviceDir}/${binName} adb shell "chmod 0777 ${deviceDir}/${binName}" - + # run adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \ -m ${deviceDir}/${llama_name} \ @@ -50,4 +50,4 @@ function android_run() { android_run -echo "android_run is Done!" \ No newline at end of file +echo "android_run is Done!" 
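The ggml change earlier in this series drops the dedicated `GGML_OP_CONV_DEPTHWISE_2D` op and its CPU forward pass, expressing depthwise convolution as `ggml_im2col` followed by `ggml_mul_mat` instead; likewise, the `ggml_permute_cpy` helper is removed because `ggml_cont(ggml_permute(...))` already yields a contiguous copy of a permuted tensor. For readers unfamiliar with the im2col trick, the snippet below is a minimal NumPy sketch of the same formulation. It is illustrative only and not part of the patch: the function names, the padding default, and the single-image (no batch) shapes are assumptions made for brevity.

```python
# Illustrative sketch (not part of the patch): depthwise 2-D convolution
# written as im2col + per-channel matrix product, mirroring the
# ggml_im2col + ggml_mul_mat formulation adopted above.
import numpy as np

def im2col(x, kh, kw, stride, pad, dilation):
    # x: [H, W] single channel -> patch matrix [OH*OW, KH*KW]
    H, W = x.shape
    oh = (H + 2 * pad - dilation * (kh - 1) - 1) // stride + 1
    ow = (W + 2 * pad - dilation * (kw - 1) - 1) // stride + 1
    xp = np.pad(x, pad)
    cols = np.empty((oh * ow, kh * kw), dtype=x.dtype)
    for i in range(oh):
        for j in range(ow):
            patch = xp[i * stride : i * stride + dilation * (kh - 1) + 1 : dilation,
                       j * stride : j * stride + dilation * (kw - 1) + 1 : dilation]
            cols[i * ow + j] = patch.ravel()
    return cols, oh, ow

def conv_depthwise_2d(x, w, stride=1, pad=1, dilation=1):
    # x: [C, H, W], w: [C, KH, KW] -> [C, OH, OW]
    # each channel is convolved with its own kernel (groups == C), no kernel flip
    kh, kw = w.shape[1], w.shape[2]
    outs = []
    for c in range(x.shape[0]):
        cols, oh, ow = im2col(x[c], kh, kw, stride, pad, dilation)
        outs.append((cols @ w[c].ravel()).reshape(oh, ow))  # one mat-vec per channel
    return np.stack(outs)

# tiny usage example with the shapes the LDP blocks in clip.cpp work with
# (2048-channel 24x24 map, 3x3 kernels, stride 1, pad 1):
# y = conv_depthwise_2d(np.random.rand(2048, 24, 24).astype(np.float32),
#                       np.random.rand(2048, 3, 3).astype(np.float32))
```

The presumable payoff of this formulation is that the inner loop becomes a matrix multiplication, which ggml already optimizes across backends, instead of a bespoke per-op CPU kernel.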
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 34f36b4a0836a..6161fd858c29f 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -217,8 +217,8 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") { size_t tensor_size = ggml_nbytes(tensor); - printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%d, %d, %d, %d], type: %d\n", - prefix, ggml_n_dims(tensor), tensor->name, tensor_size, + printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%d, %d, %d, %d], type: %d\n", + prefix, ggml_n_dims(tensor), tensor->name, tensor_size, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->type); } @@ -593,7 +593,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); // stride = 1, padding = 1, bias is nullptr block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1); - + // layer norm // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); @@ -601,11 +601,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] // hardswish struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); - + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] // pointwise conv @@ -641,7 +641,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { // stride = 2 block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, nullptr, 2, 2, 1, 1, 1, 1); - + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // layer norm block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); @@ -679,10 +679,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] - } + } embeddings = block_1; } else { @@ -996,7 +996,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); - vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, 
format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); } else { std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];