From ea6cdccea16a4bdeb54d0f37b391b83a08d9e407 Mon Sep 17 00:00:00 2001 From: Chenxiaotao03 Date: Mon, 15 Jan 2024 15:30:54 +0800 Subject: [PATCH 1/4] MobileVLM native implementation --- android/adb_run.sh | 53 +++ android/build_64.sh | 8 + examples/llava/MobileVLM-README.md | 131 ++++++ examples/llava/clip.cpp | 388 +++++++++++++++++- .../llava/convert-image-encoder-to-gguf.py | 6 +- ggml.c | 345 +++++++++++++++- ggml.h | 34 ++ 7 files changed, 939 insertions(+), 26 deletions(-) create mode 100755 android/adb_run.sh create mode 100755 android/build_64.sh create mode 100644 examples/llava/MobileVLM-README.md diff --git a/android/adb_run.sh b/android/adb_run.sh new file mode 100755 index 0000000000000..84ab887cd57c5 --- /dev/null +++ b/android/adb_run.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed" +projector_name="mmproj-model-f16.gguf" +llama_name="ggml-model-q4_k.gguf" +img_dir="/Users/cxt/model/llm" +img_name="demo.jpg" +prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:" +# img_name="cat.jpeg" +# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:" + +program_dir="build_64/bin" +binName="llava-cli" +n_threads=4 + + +deviceDir="/data/local/tmp" +saveDir="output" +if [ ! -d ${saveDir} ]; then + mkdir ${saveDir} +fi + + +function android_run() { + # # copy resource into device + # adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name} + # adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name} + adb push ${img_dir}/${img_name} ${deviceDir}/${img_name} + # copy program into device + adb push ${program_dir}/${binName} ${deviceDir}/${binName} + adb shell "chmod 0777 ${deviceDir}/${binName}" + + # run + adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \ + -m ${deviceDir}/${llama_name} \ + --mmproj ${deviceDir}/${projector_name} \ + -t ${n_threads} \ + --image ${deviceDir}/${img_name} \ + -p \"${prompt}\" \ + > ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt" + adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \ + -m ${deviceDir}/${llama_name} \ + --mmproj ${deviceDir}/${projector_name} \ + -t ${n_threads} \ + --image ${deviceDir}/${img_name} \ + -p \"${prompt}\" \ + >> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1" + adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir} +} + +android_run + +echo "android_run is Done!" \ No newline at end of file diff --git a/android/build_64.sh b/android/build_64.sh new file mode 100755 index 0000000000000..529fb291ed3cd --- /dev/null +++ b/android/build_64.sh @@ -0,0 +1,8 @@ +#!/bin/bash +cmake ../../ \ +-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ +-DCMAKE_BUILD_TYPE=Release \ +-DANDROID_ABI="arm64-v8a" \ +-DANDROID_PLATFORM=android-23 + +make -j4 diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md new file mode 100644 index 0000000000000..1d18865ff2dd9 --- /dev/null +++ b/examples/llava/MobileVLM-README.md @@ -0,0 +1,131 @@ +# MobileVLM + +Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants. 
+
+For more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
+
+The implementation is based on llava, and is compatible with both llava and MobileVLM. The usage is basically the same as llava.
+
+## Usage
+Build with cmake or run `make llava-cli` to build it.
+
+After building, run `./llava-cli` to see the usage. For example:
+
+```sh
+./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+    --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
+    --image path/to/an/image.jpg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
+```
+
+## Model conversion
+
+1. Clone `MobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
+
+```sh
+git clone https://huggingface.co/mtgv/MobileVLM-1.7B
+
+git clone https://huggingface.co/openai/clip-vit-large-patch14-336
+```
+
+2. Use `llava-surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:
+
+```sh
+python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
+```
+
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
+
+```sh
+python ./examples/llava/convert-image-encoder-to-gguf.py \
+    -m path/to/clip-vit-large-patch14-336 \
+    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
+    --output-dir path/to/MobileVLM-1.7B \
+    --projector-type ldp
+```
+
+4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+
+```sh
+python ./convert.py path/to/MobileVLM-1.7B
+```
+
+5. Use `quantize` to convert the LLaMA part's data type from `fp16` to `q4_k`:
+```sh
+./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+```
+
+Now both the LLaMA part and the image encoder are in the `MobileVLM-1.7B` directory.
+
+## Android compile and run
+### compile
+refer to `android/build_64.sh`
+```sh
+mkdir android/build_64
+cd android/build_64
+../build_64.sh
+```
+### run on Android
+refer to `android/adb_run.sh` and adjust the resource names and paths at the top of the script before running it.
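+
+A minimal sketch of the variables that typically need editing at the top of `adb_run.sh` is shown below; the directory paths are placeholders for illustration, not the values shipped in the script:
+
+```sh
+# converted model files produced by the "Model conversion" steps above
+model_dir="/path/to/MobileVLM-1.7B"        # directory holding the two gguf files
+projector_name="mmproj-model-f16.gguf"     # CLIP + LDP projector
+llama_name="ggml-model-q4_k.gguf"          # quantized LLaMA part
+
+# test image that will be pushed to /data/local/tmp on the device
+img_dir="/path/to/images"
+img_name="demo.jpg"
+
+# prompt passed to llava-cli via -p (same template as in the examples above)
+prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:"
+```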
+
+## Some results on Android with the `Snapdragon 888` chip
+### case 1
+**input**
+```sh
+/data/local/tmp/llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/demo.jpg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
+```
+**output**
+```sh
+encode_image_with_clip: image encoded in 21148.71 ms by CLIP ( 146.87 ms per image patch)
+ Susan Wise Bauer
+llama_print_timings: load time = 23574.72 ms
+llama_print_timings: sample time = 1.24 ms / 6 runs ( 0.21 ms per token, 4850.44 tokens per second)
+llama_print_timings: prompt eval time = 12460.15 ms / 246 tokens ( 50.65 ms per token, 19.74 tokens per second)
+llama_print_timings: eval time = 424.86 ms / 6 runs ( 70.81 ms per token, 14.12 tokens per second)
+llama_print_timings: total time = 34731.93 ms
+```
+### case 2
+**input**
+```sh
+/data/local/tmp/llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/cat.jpeg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:"
+```
+
+**output**
+```sh
+encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
+ The image depicts a cat sitting in the grass near some tall green plants.
+llama_print_timings: load time = 23257.32 ms
+llama_print_timings: sample time = 5.25 ms / 18 runs ( 0.29 ms per token, 3430.53 tokens per second)
+llama_print_timings: prompt eval time = 11900.73 ms / 232 tokens ( 51.30 ms per token, 19.49 tokens per second)
+llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 ms per token, 14.07 tokens per second)
+llama_print_timings: total time = 34570.79 ms
+```
+
+## Minor shortcomings
+The `ldp` projector outputs only 1/4 as many patches as it receives: with the default 336-pixel image and 14-pixel patches, the 24x24 = 576 ViT patches become 12x12 = 144 projector tokens. As a quick implementation, `clip_n_patches` is uniformly divided by four, so the per-patch time reported when measuring image encoding is four times larger than the real cost.
+
+## TODO
+
+- [ ] Support non-CPU backends for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
+- [ ] Optimize LDP projector performance
+
+      - Optimize the structure definition to avoid unnecessary memory rearrangements and reduce the use of `ggml_permute_cpy`
+      - Optimize the operator implementations (ARM CPU/NVIDIA GPU): e.g. depthwise conv, hardswish, hardsigmoid
+- [ ] Run MobileVLM on `Jetson Orin`
+- [ ] Support more model variants, such as `MobileVLM-3B`.
+
+
+## Contributors
+```sh
+zhangjidong05, yangyang260, huyiming03, chenxiaotao03
+```
\ No newline at end of file
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 2ae8853d3d5da..c900f5a2bd82c 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -12,6 +12,7 @@ #include #include #include +#include #include "clip.h" #include "ggml.h" @@ -67,6 +68,7 @@ static std::string format(const char * fmt, ...) { #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_PROJ_TYPE "clip.projector_type" + // // tensor name constants // @@ -89,6 +91,21 @@ static std::string format(const char * fmt, ...) 
{ #define TN_TEXT_PROJ "text_projection.weight" #define TN_VIS_PROJ "visual_projection.weight" #define TN_LLAVA_PROJ "mm.%d.%s" +#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" +#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" + + +enum projector_type { + PROJECTOR_TYPE_MLP, + PROJECTOR_TYPE_LDP, + PROJECTOR_TYPE_UNKNOWN, +}; + +static std::map PROJECTOR_TYPE_NAMES = { + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, +}; + // // utilities to get data from a gguf file @@ -129,6 +146,91 @@ static std::string get_ftype(int ftype) { return ggml_type_name(static_cast(ftype)); } +static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { + switch (type) { + case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); + case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); + case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); + case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); + case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); + case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); + case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); + case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); + case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); + case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); + case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; + default: return format("unknown type %d", type); + } +} + + +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { + std::string result; + for (size_t pos = 0; ; pos += search.length()) { + auto new_pos = s.find(search, pos); + if (new_pos == std::string::npos) { + result += s.substr(pos, s.size() - pos); + break; + } + result += s.substr(pos, new_pos - pos) + replace; + pos = new_pos; + } + s = std::move(result); +} + +static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { + const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + + switch (type) { + case GGUF_TYPE_STRING: + return gguf_get_val_str(ctx_gguf, i); + case GGUF_TYPE_ARRAY: + { + const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); + int arr_n = gguf_get_arr_n(ctx_gguf, i); + const void * data = gguf_get_arr_data(ctx_gguf, i); + std::stringstream ss; + ss << "["; + for (int j = 0; j < arr_n; j++) { + if (arr_type == GGUF_TYPE_STRING) { + std::string val = gguf_get_arr_str(ctx_gguf, i, j); + // escape quotes + replace_all(val, "\\", "\\\\"); + replace_all(val, "\"", "\\\""); + ss << '"' << val << '"'; + } else if (arr_type == GGUF_TYPE_ARRAY) { + ss << "???"; + } else { + ss << gguf_data_to_str(arr_type, data, j); + } + if (j < arr_n - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + } + default: + return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); + } +} + +static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") { + size_t tensor_size = ggml_nbytes(tensor); + printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%d, %d, %d, %d], type: %d\n", + prefix, ggml_n_dims(tensor), tensor->name, tensor_size, + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->type); +} + +static projector_type clip_projector_type_from_string(const std::string & name) { + for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT + if (kv.second == name) { + return kv.first; + } + } 
+ return PROJECTOR_TYPE_UNKNOWN; +} + // // image data // @@ -205,6 +307,32 @@ struct clip_vision_model { struct ggml_tensor * mm_0_b; struct ggml_tensor * mm_2_w; struct ggml_tensor * mm_2_b; + + // MobileVLM projection + struct ggml_tensor * mm_model_mlp_1_w; + struct ggml_tensor * mm_model_mlp_1_b; + struct ggml_tensor * mm_model_mlp_3_w; + struct ggml_tensor * mm_model_mlp_3_b; + struct ggml_tensor * mm_model_block_1_block_0_0_w; + struct ggml_tensor * mm_model_block_1_block_0_1_w; + struct ggml_tensor * mm_model_block_1_block_0_1_b; + struct ggml_tensor * mm_model_block_1_block_1_fc1_w; + struct ggml_tensor * mm_model_block_1_block_1_fc1_b; + struct ggml_tensor * mm_model_block_1_block_1_fc2_w; + struct ggml_tensor * mm_model_block_1_block_1_fc2_b; + struct ggml_tensor * mm_model_block_1_block_2_0_w; + struct ggml_tensor * mm_model_block_1_block_2_1_w; + struct ggml_tensor * mm_model_block_1_block_2_1_b; + struct ggml_tensor * mm_model_block_2_block_0_0_w; + struct ggml_tensor * mm_model_block_2_block_0_1_w; + struct ggml_tensor * mm_model_block_2_block_0_1_b; + struct ggml_tensor * mm_model_block_2_block_1_fc1_w; + struct ggml_tensor * mm_model_block_2_block_1_fc1_b; + struct ggml_tensor * mm_model_block_2_block_1_fc2_w; + struct ggml_tensor * mm_model_block_2_block_1_fc2_b; + struct ggml_tensor * mm_model_block_2_block_2_0_w; + struct ggml_tensor * mm_model_block_2_block_2_1_w; + struct ggml_tensor * mm_model_block_2_block_2_1_b; }; struct clip_ctx { @@ -213,6 +341,7 @@ struct clip_ctx { bool has_llava_projector = false; struct clip_vision_model vision_model; + projector_type proj_type = PROJECTOR_TYPE_MLP; float image_mean[3]; float image_std[3]; @@ -430,16 +559,132 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 free(patches_data); } + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] embeddings = ggml_get_rows(ctx0, embeddings, patches); - // mm projection 0 - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // print_tensor_info(embeddings, "embeddings"); + + // llava projector + if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + embeddings = ggml_gelu(ctx0, embeddings); - embeddings = ggml_gelu(ctx0, embeddings); + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } + else if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projector + int n_patch = 24; + struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); + mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); + mlp_1 = ggml_gelu(ctx0, mlp_1); + struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); + mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); + // transpose from [1, 576, 2048] --> [1, 24, 24, 2048] --> [1, 2048, 24, 24] + mlp_3 = ggml_reshape_4d(ctx0, mlp_3, mlp_3->ne[0], n_patch, n_patch, mlp_3->ne[3]); + // permute logic is src idxs 0,1,2,3 perm to dst idxs + mlp_3 = ggml_permute_cpy(ctx0, mlp_3, 2, 0, 1, 3); + // mlp_3 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + + // block 1 + struct ggml_tensor * block_1 = nullptr; + { + // stride = 1, padding = 1, bias is nullptr + block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1); + + // layer norm + // // block_1 shape = [1, 2048, 24, 24], ne = 
[24, 24, 2048, 1] + block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); + block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // hardswish + struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_1_block_2_0_w, 1, 1, + model.mm_model_block_1_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]); + block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1); + + // layernorm + block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); + block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // residual + block_1 = ggml_add(ctx0, mlp_3, block_1); + } - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + // block_2 + { + // stride = 2 + block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, nullptr, 2, 2, 1, 1, 1, 1); + + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // layer norm + block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); + block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // hardswish + struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + // not sure the parameters is right for globalAvgPooling + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, 
model.mm_model_block_2_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_2_block_2_0_w, 1, 1, + model.mm_model_block_2_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]); + block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1); + // layernorm + block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); + // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] + } + embeddings = block_1; + } + else { + GGML_ASSERT(false); + } } // build the graph @@ -485,16 +730,55 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { printf("\n"); } const int n_tensors = gguf_get_n_tensors(ctx); + // kv - if (verbosity >= 3) { - const int n_kv = gguf_get_n_kv(ctx); + const int n_kv = gguf_get_n_kv(ctx); + printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", + __func__, n_kv, n_tensors, fname); + { + std::map n_type; + + uint32_t n_type_max = 0; + enum ggml_type type_max = GGML_TYPE_F32; - for (int i = 0; i < n_kv; ++i) { - const char * key = gguf_get_key(ctx, i); + for (int i = 0; i < n_tensors; i++) { + enum ggml_type type = gguf_get_tensor_type(ctx, i); - printf("%s: kv[%d]: key = %s\n", __func__, i, key); + n_type[type]++; + + if (n_type_max < n_type[type]) { + n_type_max = n_type[type]; + type_max = type; + } + } + + printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + for (int i = 0; i < n_kv; i++) { + const char * name = gguf_get_key(ctx, i); + const enum gguf_type type = gguf_get_kv_type(ctx, i); + const std::string type_name = + type == GGUF_TYPE_ARRAY + ? 
format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i)) + : gguf_type_name(type); + + std::string value = gguf_kv_to_str(ctx, i); + const size_t MAX_VALUE_LEN = 40; + if (value.size() > MAX_VALUE_LEN) { + value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); + } + replace_all(value, "\n", "\\n"); + + printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + } + + // print type counts + for (auto & kv : n_type) { + if (kv.second == 0) { + continue; + } + + printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); } - printf("\n"); } // data @@ -503,20 +787,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); + enum ggml_type type = gguf_get_tensor_type(ctx, i); struct ggml_tensor * cur = ggml_get_tensor(meta, name); size_t tensor_size = ggml_nbytes(cur); buffer_size += tensor_size; if (verbosity >= 3) { - printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i, - ggml_n_dims(cur), cur->name, tensor_size, offset); + printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%d, %d, %d, %d], type: %d\n", __func__, i, + ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], type); } } } + + buffer_size += n_tensors * 128 /* CLIP PADDING */; clip_ctx * new_clip = new clip_ctx; + // update projector type + { + int idx = gguf_find_key(ctx, KEY_PROJ_TYPE); + if (idx != -1) { + const std::string proj_type = gguf_get_val_str(ctx, idx); + new_clip->proj_type = clip_projector_type_from_string(proj_type); + } + else { + new_clip->proj_type = PROJECTOR_TYPE_MLP; + } + } + #ifdef GGML_USE_CUBLAS new_clip->backend = ggml_backend_cuda_init(0); printf("%s: CLIP using CUDA backend\n", __func__); @@ -661,10 +960,45 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); - vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); - vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); - vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); - vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); + + // LLaVA projection + if (new_clip->proj_type == PROJECTOR_TYPE_MLP) { + vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); + vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); + vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); + } + else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projection + vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); + vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); + vision_model.mm_model_mlp_3_w = 
get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); + vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); + vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); + vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); + vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); + vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); + vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); + vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); + vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); + vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); + vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); + vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + } + else { + std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); + } vision_model.layers.resize(hparams.n_layer); for (int il = 0; il < hparams.n_layer; ++il) { @@ -1100,13 +1434,25 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i } int clip_n_mmproj_embd(const struct clip_ctx * ctx) { - return ctx->vision_model.mm_2_b->ne[0]; + if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + } + else if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + return ctx->vision_model.mm_2_b->ne[0]; + } + else { + std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, 
proj_type.c_str())); + } } int clip_n_patches(const struct clip_ctx * ctx) { auto & params = ctx->vision_model.hparams; - - return (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + n_patches /= 4; + } + return n_patches; } size_t clip_embd_nbytes(const struct clip_ctx * ctx) { diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 03688e0ea1889..f5a3c9b46f9e3 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -81,6 +81,7 @@ def bytes_to_unicode(): ap.add_argument("--clip_model_is_vision", action="store_true", required=False, help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) @@ -174,6 +175,8 @@ def bytes_to_unicode(): fout.add_description("vision-only CLIP model") elif has_llava_projector: fout.add_description("image encoder for LLaVA") + # add projector type + fout.add_string("clip.projector_type", args.projector_type) else: fout.add_description("two-tower CLIP model") @@ -218,7 +221,8 @@ def bytes_to_unicode(): projector = torch.load(args.llava_projector) for name, data in projector.items(): name = get_tensor_name(name) - if data.ndim == 2: + # pw and dw conv ndim==4 + if data.ndim == 2 or data.ndim == 4: data = data.squeeze().numpy().astype(np.float16) else: data = data.squeeze().numpy().astype(np.float32) diff --git a/ggml.c b/ggml.c index ef5888ab21538..3befb4efe908c 100644 --- a/ggml.c +++ b/ggml.c @@ -1424,6 +1424,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? 
x[i] : 0.f); } +// TODO: optimize performance +inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } +inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } static const float GELU_COEF_A = 0.044715f; static const float GELU_QUICK_COEF = -1.702f; @@ -1647,6 +1650,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CLAMP", "CONV_TRANSPOSE_1D", "IM2COL", + "CONV_DEPTHWISE_2D", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", @@ -1680,7 +1684,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1734,6 +1738,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "conv_transpose_1d(x)", "im2col(x)", "conv_transpose_2d(x)", + "conv_depthwise_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", @@ -1766,7 +1771,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -1782,9 +1787,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = { "GELU", "GELU_QUICK", "SILU", + "HARDSWISH", + "HARDSIGMOID", }; -static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10"); +static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); @@ -3951,6 +3958,20 @@ struct ggml_tensor * ggml_silu_back( return result; } +// ggml hardswish +struct ggml_tensor * ggml_hardswish( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH); +} + +// ggml hardsigmoid +struct ggml_tensor * ggml_hardsigmoid( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID); +} + // ggml_norm static struct ggml_tensor * ggml_norm_impl( @@ -4759,6 +4780,24 @@ struct ggml_tensor * ggml_permute( return result; } +// some operations don't support permuted tensor, so we need to copy it, to avoid this case +struct ggml_tensor * ggml_permute_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3) { + struct ggml_tensor * result = ggml_permute(ctx, a, axis0, axis1, axis2, axis3); + // new 4d tensor + struct ggml_tensor* tensor = ggml_new_tensor_4d(ctx, a->type, result->ne[0], result->ne[1], result->ne[2], result->ne[3]); + + struct ggml_tensor* cpy = ggml_cpy(ctx, result, tensor); + + return cpy; +} + + // ggml_transpose struct ggml_tensor * ggml_transpose( @@ -5350,6 +5389,52 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( return result; } +// ggml_conv_depthwise +struct ggml_tensor * ggml_conv_depthwise_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + + + const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); + const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const 
int64_t ne[4] = { + OW, + OH, + b->ne[2], + b->ne[3], + }; + // GGML_ASSERT(a->ne[3] == b->ne[2]); + // GGML_ASSERT(a->ne[2] == 1); + + // weight ne: [KW, KH, OC, 1] + GGML_ASSERT(a->ne[2] == b->ne[2]); + GGML_ASSERT(a->ne[3] == 1); + bool is_node = false; + /* + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + */ + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + int32_t params[] = { s0, s1, p0, p1, d0, d1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_DEPTHWISE_2D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + return result; +} // ggml_conv_2d // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] @@ -9339,6 +9424,87 @@ static void ggml_compute_forward_silu_back( } } + +static void ggml_compute_forward_hardswish_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_hardswish_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} +static void ggml_compute_forward_hardswish( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_hardswish_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_hardsigmoid_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_hardsigmoid_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_hardsigmoid( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_hardsigmoid_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + + // ggml_compute_forward_norm static void ggml_compute_forward_norm_f32( @@ -12363,6 +12529,160 @@ static void ggml_compute_forward_im2col( } } +static void ggml_compute_forward_conv_depthwise_2d_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + // total patches in dst + const int np = ne2; + + // patches per thread + const int dp = (np + 
nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + const int32_t stride_h = ggml_get_op_params_i32(dst, 0); + const int32_t stride_w = ggml_get_op_params_i32(dst, 1); + const int32_t pad_h = ggml_get_op_params_i32(dst, 2); + const int32_t pad_w = ggml_get_op_params_i32(dst, 3); + const int32_t dilation_h = ggml_get_op_params_i32(dst, 4); + const int32_t dilation_w = ggml_get_op_params_i32(dst, 5); + + float* weight = (float*)(src0->data); + float* input = (float*)(src1->data); + // float* bias = (float*)(src2->data); + float* output = (float*)(dst->data); + for (int b = 0; b < ne13; ++b) { + for (int o_c = ip0; o_c < ip1; ++o_c) { + for (int o_h = 0; o_h < ne1; ++o_h) { + for (int o_w = 0; o_w < ne0; ++o_w) { + float result_data = 0; + int g = o_c; + int i_c = g; + for (int k_h = 0; k_h < ne01; ++k_h) { + for (int k_w = 0; k_w < ne00; ++k_w) { + int i_h = o_h * stride_h - pad_h + k_h * dilation_h; + int i_w = o_w * stride_w - pad_w + k_w * dilation_w; + if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) { + continue; + } + float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w]; + float weight_data = weight[(g * ne01 + k_h) * ne00 + k_w]; + result_data += input_data * weight_data; + } + } + // output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c]; + output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data; + } + } + } + } + +} + +static void ggml_compute_forward_conv_depthwise_2d_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + // total patches in dst + const int np = ne2; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + const int32_t stride_h = ggml_get_op_params_i32(dst, 0); + const int32_t stride_w = ggml_get_op_params_i32(dst, 1); + const int32_t pad_h = ggml_get_op_params_i32(dst, 2); + const int32_t pad_w = ggml_get_op_params_i32(dst, 3); + const int32_t dilation_h = ggml_get_op_params_i32(dst, 4); + const int32_t dilation_w = ggml_get_op_params_i32(dst, 5); + + ggml_fp16_t* weight = (ggml_fp16_t*)(src0->data); + float* input = (float*)(src1->data); + // float* bias = (float*)(src2->data); + float* output = (float*)(dst->data); + for (int b = 0; b < ne13; ++b) { + for (int o_c = ip0; o_c < ip1; ++o_c) { + for (int o_h = 0; o_h < ne1; ++o_h) { + for (int o_w = 0; o_w < ne0; ++o_w) { + float result_data = 0; + int g = o_c; + int i_c = g; + for (int k_h = 0; k_h < ne01; ++k_h) { + for (int k_w = 0; k_w < ne00; ++k_w) { + int i_h = o_h * stride_h - pad_h + k_h * dilation_h; + int i_w = o_w * stride_w - pad_w + k_w * dilation_w; + if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) { + continue; + } + float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w]; + float weight_data = GGML_FP16_TO_FP32(weight[(g * ne01 + k_h) * ne00 + k_w]); + result_data += input_data * weight_data; + } + } + // output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c]; + output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data; + } + } + } + } + +} + +static void 
ggml_compute_forward_conv_depthwise_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_depthwise_2d_f32(params, src0, src1, src2, dst); + } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_conv_depthwise_2d_f16_f32(params, src0, src1, src2, dst); + } else { + GGML_ASSERT(false); + } + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_conv_transpose_2d static void ggml_compute_forward_conv_transpose_2d( @@ -13931,6 +14251,14 @@ static void ggml_compute_forward_unary( { ggml_compute_forward_silu(params, src0, dst); } break; + case GGML_UNARY_OP_HARDSWISH: + { + ggml_compute_forward_hardswish(params, src0, dst); + } break; + case GGML_UNARY_OP_HARDSIGMOID: + { + ggml_compute_forward_hardsigmoid(params, src0, dst); + } break; default: { GGML_ASSERT(false); @@ -14696,6 +15024,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); } break; + case GGML_OP_CONV_DEPTHWISE_2D: + { + ggml_compute_forward_conv_depthwise_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -16344,6 +16676,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads + case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads { n_tasks = 1; } break; @@ -16430,6 +16764,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = n_threads; } break; + case GGML_OP_CONV_DEPTHWISE_2D: + { + n_tasks = n_threads; + } break; case GGML_OP_CONV_TRANSPOSE_2D: { n_tasks = n_threads; @@ -16576,7 +16914,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // distribute new work or execute it direct if 1T while (++node_n < cgraph->n_nodes) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); - struct ggml_tensor * node = cgraph->nodes[node_n]; const int n_tasks = ggml_get_n_tasks(node, n_threads); diff --git a/ggml.h b/ggml.h index 1187074f7f174..1ca68c2f44de3 100644 --- a/ggml.h +++ b/ggml.h @@ -433,6 +433,7 @@ extern "C" { GGML_OP_CLAMP, GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, + GGML_OP_CONV_DEPTHWISE_2D, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, @@ -479,6 +480,8 @@ extern "C" { GGML_UNARY_OP_GELU, GGML_UNARY_OP_GELU_QUICK, GGML_UNARY_OP_SILU, + GGML_UNARY_OP_HARDSWISH, + GGML_UNARY_OP_HARDSIGMOID, GGML_UNARY_OP_COUNT, }; @@ -1022,6 +1025,16 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // hardswish(x) = x * relu6(x + 3) / 6 + GGML_API struct ggml_tensor * ggml_hardswish( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // hardsigmoid(x) = relu6(x + 3) / 6 + GGML_API struct ggml_tensor * ggml_hardsigmoid( + struct ggml_context * ctx, + struct ggml_tensor * a); + // normalize along rows GGML_API struct ggml_tensor * ggml_norm( struct ggml_context * ctx, @@ -1284,6 +1297,15 @@ extern "C" { int axis2, int axis3); + // some operations don't support permuted tensor, so we need to copy it, to avoid this case + GGML_API 
struct ggml_tensor * ggml_permute_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3); + // alias for ggml_permute(ctx, a, 1, 0, 2, 3) GGML_API struct ggml_tensor * ggml_transpose( struct ggml_context * ctx, @@ -1473,6 +1495,18 @@ extern "C" { int d1, bool is_2D); + GGML_API struct ggml_tensor * ggml_conv_depthwise_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); + GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, From 9303bbf1b15baf30d857e4aff1aa84de152a549a Mon Sep 17 00:00:00 2001 From: Chenxiaotao03 Date: Fri, 19 Jan 2024 12:50:01 +0800 Subject: [PATCH 2/4] delete depthwise_conv_2d and permute_cpy relative code, replace the two by the existed functions, and opt ldp definition, support LLAMA_PERF option for CMake --- CMakeLists.txt | 7 ++ android/build_64.sh | 2 +- examples/llava/clip.cpp | 51 ++++----- ggml.c | 222 ++-------------------------------------- ggml.h | 10 -- 5 files changed, 46 insertions(+), 246 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2741568ed3430..390bccc231c78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,6 +107,13 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STA option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ON) + +# add perf arguments +option(LLAMA_PERF "llama: enable perf" OFF) +if (LLAMA_PERF) + add_definitions(-DGGML_PERF) +endif() + # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) diff --git a/android/build_64.sh b/android/build_64.sh index 529fb291ed3cd..3982854e2c3f6 100755 --- a/android/build_64.sh +++ b/android/build_64.sh @@ -3,6 +3,6 @@ cmake ../../ \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DCMAKE_BUILD_TYPE=Release \ -DANDROID_ABI="arm64-v8a" \ --DANDROID_PLATFORM=android-23 +-DANDROID_PLATFORM=android-23 $1 make -j4 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index c900f5a2bd82c..34f36b4a0836a 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -583,25 +583,24 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 mlp_1 = ggml_gelu(ctx0, mlp_1); struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); - // transpose from [1, 576, 2048] --> [1, 24, 24, 2048] --> [1, 2048, 24, 24] - mlp_3 = ggml_reshape_4d(ctx0, mlp_3, mlp_3->ne[0], n_patch, n_patch, mlp_3->ne[3]); - // permute logic is src idxs 0,1,2,3 perm to dst idxs - mlp_3 = ggml_permute_cpy(ctx0, mlp_3, 2, 0, 1, 3); - // mlp_3 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] // block 1 struct ggml_tensor * block_1 = nullptr; { + // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] + mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); + mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); // stride = 1, padding = 1, bias is nullptr block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1); // layer norm // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + block_1 = ggml_cont(ctx0, 
ggml_permute(ctx0, block_1, 1, 2, 0, 3)); // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); - block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] // hardswish @@ -621,17 +620,18 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); block_1 = ggml_mul(ctx0, block_1_hw, block_1); - // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_1_block_2_0_w, 1, 1, - model.mm_model_block_1_block_2_0_w->ne[0], model.mm_model_block_1_block_2_0_w->ne[1]); - block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1); + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); - // layernorm - block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); - block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] // residual block_1 = ggml_add(ctx0, mlp_3, block_1); @@ -644,11 +644,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // layer norm - block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); - block_1 = ggml_permute_cpy(ctx0, block_1, 2, 0, 1, 3); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // hardswish struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); @@ -664,22 +664,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); block_1 = ggml_hardsigmoid(ctx0, block_1); - + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); block_1 = ggml_mul(ctx0, block_1_hw, block_1); - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - struct ggml_tensor* block_2_0_w_4d = ggml_reshape_4d(ctx0, model.mm_model_block_2_block_2_0_w, 1, 1, - model.mm_model_block_2_block_2_0_w->ne[0], 
model.mm_model_block_1_block_2_0_w->ne[1]); - block_1 = ggml_conv_2d(ctx0, block_2_0_w_4d, block_1, 1, 1, 0, 0, 1, 1); - // layernorm - block_1 = ggml_permute_cpy(ctx0, block_1, 1, 2, 0, 3); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] - } + } embeddings = block_1; } else { diff --git a/ggml.c b/ggml.c index 3befb4efe908c..2e3849210778f 100644 --- a/ggml.c +++ b/ggml.c @@ -1650,7 +1650,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CLAMP", "CONV_TRANSPOSE_1D", "IM2COL", - "CONV_DEPTHWISE_2D", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", @@ -1684,7 +1683,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); +static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1738,7 +1737,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "conv_transpose_1d(x)", "im2col(x)", "conv_transpose_2d(x)", - "conv_depthwise_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", @@ -1771,7 +1769,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); +static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4780,24 +4778,6 @@ struct ggml_tensor * ggml_permute( return result; } -// some operations don't support permuted tensor, so we need to copy it, to avoid this case -struct ggml_tensor * ggml_permute_cpy( - struct ggml_context * ctx, - struct ggml_tensor * a, - int axis0, - int axis1, - int axis2, - int axis3) { - struct ggml_tensor * result = ggml_permute(ctx, a, axis0, axis1, axis2, axis3); - // new 4d tensor - struct ggml_tensor* tensor = ggml_new_tensor_4d(ctx, a->type, result->ne[0], result->ne[1], result->ne[2], result->ne[3]); - - struct ggml_tensor* cpy = ggml_cpy(ctx, result, tensor); - - return cpy; -} - - // ggml_transpose struct ggml_tensor * ggml_transpose( @@ -5402,37 +5382,18 @@ struct ggml_tensor * ggml_conv_depthwise_2d( int d0, int d1) { + struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); + struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, + ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), + s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW] - const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); - const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); - const int64_t ne[4] = { - OW, - OH, - b->ne[2], - b->ne[3], - }; - // GGML_ASSERT(a->ne[3] == b->ne[2]); - // GGML_ASSERT(a->ne[2] == 1); + struct ggml_tensor * result = + ggml_mul_mat(ctx, + ggml_reshape_4d(ctx, new_a, 
(new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC,1, KH, KW] => [1, OC, 1, KH * KW] + ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] - // weight ne: [KW, KH, OC, 1] - GGML_ASSERT(a->ne[2] == b->ne[2]); - GGML_ASSERT(a->ne[3] == 1); - bool is_node = false; - /* - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - */ - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - int32_t params[] = { s0, s1, p0, p1, d0, d1 }; - ggml_set_op_params(result, params, sizeof(params)); + result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW] - result->op = GGML_OP_CONV_DEPTHWISE_2D; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - result->src[2] = c; return result; } // ggml_conv_2d @@ -12529,159 +12490,6 @@ static void ggml_compute_forward_im2col( } } -static void ggml_compute_forward_conv_depthwise_2d_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - // total patches in dst - const int np = ne2; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); - - const int32_t stride_h = ggml_get_op_params_i32(dst, 0); - const int32_t stride_w = ggml_get_op_params_i32(dst, 1); - const int32_t pad_h = ggml_get_op_params_i32(dst, 2); - const int32_t pad_w = ggml_get_op_params_i32(dst, 3); - const int32_t dilation_h = ggml_get_op_params_i32(dst, 4); - const int32_t dilation_w = ggml_get_op_params_i32(dst, 5); - - float* weight = (float*)(src0->data); - float* input = (float*)(src1->data); - // float* bias = (float*)(src2->data); - float* output = (float*)(dst->data); - for (int b = 0; b < ne13; ++b) { - for (int o_c = ip0; o_c < ip1; ++o_c) { - for (int o_h = 0; o_h < ne1; ++o_h) { - for (int o_w = 0; o_w < ne0; ++o_w) { - float result_data = 0; - int g = o_c; - int i_c = g; - for (int k_h = 0; k_h < ne01; ++k_h) { - for (int k_w = 0; k_w < ne00; ++k_w) { - int i_h = o_h * stride_h - pad_h + k_h * dilation_h; - int i_w = o_w * stride_w - pad_w + k_w * dilation_w; - if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) { - continue; - } - float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w]; - float weight_data = weight[(g * ne01 + k_h) * ne00 + k_w]; - result_data += input_data * weight_data; - } - } - // output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c]; - output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data; - } - } - } - } - -} - -static void ggml_compute_forward_conv_depthwise_2d_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = 
params->nth; - - // total patches in dst - const int np = ne2; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); - - const int32_t stride_h = ggml_get_op_params_i32(dst, 0); - const int32_t stride_w = ggml_get_op_params_i32(dst, 1); - const int32_t pad_h = ggml_get_op_params_i32(dst, 2); - const int32_t pad_w = ggml_get_op_params_i32(dst, 3); - const int32_t dilation_h = ggml_get_op_params_i32(dst, 4); - const int32_t dilation_w = ggml_get_op_params_i32(dst, 5); - - ggml_fp16_t* weight = (ggml_fp16_t*)(src0->data); - float* input = (float*)(src1->data); - // float* bias = (float*)(src2->data); - float* output = (float*)(dst->data); - for (int b = 0; b < ne13; ++b) { - for (int o_c = ip0; o_c < ip1; ++o_c) { - for (int o_h = 0; o_h < ne1; ++o_h) { - for (int o_w = 0; o_w < ne0; ++o_w) { - float result_data = 0; - int g = o_c; - int i_c = g; - for (int k_h = 0; k_h < ne01; ++k_h) { - for (int k_w = 0; k_w < ne00; ++k_w) { - int i_h = o_h * stride_h - pad_h + k_h * dilation_h; - int i_w = o_w * stride_w - pad_w + k_w * dilation_w; - if (i_h < 0 || i_h >= ne11 || i_w < 0 || i_w >= ne10) { - continue; - } - float input_data = input[((b * ne12 + i_c) * ne11 + i_h) * ne10 + i_w]; - float weight_data = GGML_FP16_TO_FP32(weight[(g * ne01 + k_h) * ne00 + k_w]); - result_data += input_data * weight_data; - } - } - // output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data + bias[o_c]; - output[((b * ne2 + o_c) * ne1 + o_h) * ne0 + o_w] = result_data; - } - } - } - } - -} - -static void ggml_compute_forward_conv_depthwise_2d( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_conv_depthwise_2d_f32(params, src0, src1, src2, dst); - } break; - case GGML_TYPE_F16: - { - if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_conv_depthwise_2d_f16_f32(params, src0, src1, src2, dst); - } else { - GGML_ASSERT(false); - } - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} // ggml_compute_forward_conv_transpose_2d @@ -15024,10 +14832,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); } break; - case GGML_OP_CONV_DEPTHWISE_2D: - { - ggml_compute_forward_conv_depthwise_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); - } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -16764,10 +16568,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = n_threads; } break; - case GGML_OP_CONV_DEPTHWISE_2D: - { - n_tasks = n_threads; - } break; case GGML_OP_CONV_TRANSPOSE_2D: { n_tasks = n_threads; diff --git a/ggml.h b/ggml.h index 1ca68c2f44de3..50e3882c8bb6d 100644 --- a/ggml.h +++ b/ggml.h @@ -433,7 +433,6 @@ extern "C" { GGML_OP_CLAMP, GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, - GGML_OP_CONV_DEPTHWISE_2D, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, @@ -1297,15 +1296,6 @@ extern "C" { int axis2, int axis3); - // some operations don't support permuted tensor, so we need to copy it, to avoid this case - GGML_API struct ggml_tensor * ggml_permute_cpy( - struct ggml_context * ctx, - struct ggml_tensor * a, - int axis0, - int axis1, 
- int axis2, - int axis3); - // alias for ggml_permute(ctx, a, 1, 0, 2, 3) GGML_API struct ggml_tensor * ggml_transpose( struct ggml_context * ctx, From c37859bf21ec3cafed63d4f5b83e3defbee30047 Mon Sep 17 00:00:00 2001 From: Chenxiaotao03 Date: Sun, 21 Jan 2024 15:47:18 +0800 Subject: [PATCH 3/4] move android script to example/llava directory --- examples/llava/MobileVLM-README.md | 6 +++--- {android => examples/llava/android}/adb_run.sh | 0 {android => examples/llava/android}/build_64.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename {android => examples/llava/android}/adb_run.sh (100%) rename {android => examples/llava/android}/build_64.sh (89%) diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 1d18865ff2dd9..7965602b76565 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -59,10 +59,10 @@ Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directo ## Android compile and run ### compile -refer to `android/build_64.sh` +refer to `examples/llava/android/build_64.sh` ```sh -mkdir android/build_64 -cd android/build_64 +mkdir examples/llava/android/build_64 +cd examples/llava/android/build_64 ../build_64.sh ``` ### run on Android diff --git a/android/adb_run.sh b/examples/llava/android/adb_run.sh similarity index 100% rename from android/adb_run.sh rename to examples/llava/android/adb_run.sh diff --git a/android/build_64.sh b/examples/llava/android/build_64.sh similarity index 89% rename from android/build_64.sh rename to examples/llava/android/build_64.sh index 3982854e2c3f6..71b6fd3f719cd 100755 --- a/android/build_64.sh +++ b/examples/llava/android/build_64.sh @@ -1,5 +1,5 @@ #!/bin/bash -cmake ../../ \ +cmake ../../../../ \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DCMAKE_BUILD_TYPE=Release \ -DANDROID_ABI="arm64-v8a" \ From 0e57eb875e702db006ec1a09d5b04d41a0fd066f Mon Sep 17 00:00:00 2001 From: Chenxiaotao03 Date: Mon, 22 Jan 2024 19:51:14 +0800 Subject: [PATCH 4/4] Fix the editor config checks --- examples/llava/MobileVLM-README.md | 2 +- examples/llava/android/adb_run.sh | 4 ++-- examples/llava/clip.cpp | 18 +++++++++--------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 7965602b76565..c6258eba69a53 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -128,4 +128,4 @@ The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quic ## contributor ```sh zhangjidong05, yangyang260, huyiming03, chenxiaotao03 -``` \ No newline at end of file +``` diff --git a/examples/llava/android/adb_run.sh b/examples/llava/android/adb_run.sh index 84ab887cd57c5..f73623ae3b129 100755 --- a/examples/llava/android/adb_run.sh +++ b/examples/llava/android/adb_run.sh @@ -29,7 +29,7 @@ function android_run() { # copy program into device adb push ${program_dir}/${binName} ${deviceDir}/${binName} adb shell "chmod 0777 ${deviceDir}/${binName}" - + # run adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \ -m ${deviceDir}/${llama_name} \ @@ -50,4 +50,4 @@ function android_run() { android_run -echo "android_run is Done!" \ No newline at end of file +echo "android_run is Done!" 
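The ggml change earlier in this series drops the dedicated `GGML_OP_CONV_DEPTHWISE_2D` op and its CPU forward pass, expressing depthwise convolution as `ggml_im2col` followed by `ggml_mul_mat` instead; likewise, the `ggml_permute_cpy` helper is removed because `ggml_cont(ggml_permute(...))` already yields a contiguous copy of a permuted tensor. For readers unfamiliar with the im2col trick, the snippet below is a minimal NumPy sketch of the same formulation. It is illustrative only and not part of the patch: the function names, the padding default, and the single-image (no batch) shapes are assumptions made for brevity.

```python
# Illustrative sketch (not part of the patch): depthwise 2-D convolution
# written as im2col + per-channel matrix product, mirroring the
# ggml_im2col + ggml_mul_mat formulation adopted above.
import numpy as np

def im2col(x, kh, kw, stride, pad, dilation):
    # x: [H, W] single channel -> patch matrix [OH*OW, KH*KW]
    H, W = x.shape
    oh = (H + 2 * pad - dilation * (kh - 1) - 1) // stride + 1
    ow = (W + 2 * pad - dilation * (kw - 1) - 1) // stride + 1
    xp = np.pad(x, pad)
    cols = np.empty((oh * ow, kh * kw), dtype=x.dtype)
    for i in range(oh):
        for j in range(ow):
            patch = xp[i * stride : i * stride + dilation * (kh - 1) + 1 : dilation,
                       j * stride : j * stride + dilation * (kw - 1) + 1 : dilation]
            cols[i * ow + j] = patch.ravel()
    return cols, oh, ow

def conv_depthwise_2d(x, w, stride=1, pad=1, dilation=1):
    # x: [C, H, W], w: [C, KH, KW] -> [C, OH, OW]
    # each channel is convolved with its own kernel (groups == C), no kernel flip
    kh, kw = w.shape[1], w.shape[2]
    outs = []
    for c in range(x.shape[0]):
        cols, oh, ow = im2col(x[c], kh, kw, stride, pad, dilation)
        outs.append((cols @ w[c].ravel()).reshape(oh, ow))  # one mat-vec per channel
    return np.stack(outs)

# tiny usage example with the shapes the LDP blocks in clip.cpp work with
# (2048-channel 24x24 map, 3x3 kernels, stride 1, pad 1):
# y = conv_depthwise_2d(np.random.rand(2048, 24, 24).astype(np.float32),
#                       np.random.rand(2048, 3, 3).astype(np.float32))
```

The presumable payoff of this formulation is that the inner loop becomes a matrix multiplication, which ggml already optimizes across backends, instead of a bespoke per-op CPU kernel.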
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 34f36b4a0836a..6161fd858c29f 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -217,8 +217,8 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") { size_t tensor_size = ggml_nbytes(tensor); - printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%d, %d, %d, %d], type: %d\n", - prefix, ggml_n_dims(tensor), tensor->name, tensor_size, + printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%d, %d, %d, %d], type: %d\n", + prefix, ggml_n_dims(tensor), tensor->name, tensor_size, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->type); } @@ -593,7 +593,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); // stride = 1, padding = 1, bias is nullptr block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1); - + // layer norm // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); @@ -601,11 +601,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 block_1 = ggml_norm(ctx0, block_1, eps); block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] // hardswish struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); - + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] // pointwise conv @@ -641,7 +641,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { // stride = 2 block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, nullptr, 2, 2, 1, 1, 1, 1); - + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // layer norm block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); @@ -679,10 +679,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] - } + } embeddings = block_1; } else { @@ -996,7 +996,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); - vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, 
format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); } else { std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];