update ncnnoptimize layers, lightmode=false keeps original weight #5414

Merged · 6 commits · Apr 9, 2024

Changes from all commits
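
The recurring change in this PR: every `create_pipeline` variant in the ARM layers below used to release the raw weights unconditionally once the transformed kernels (`weight_data_tm`, `weight_sgemm_data`, ...) were built. Per the PR title, that release is now guarded on light mode, so running with `lightmode=false` keeps the original `weight_data` around, e.g. for ncnnoptimize to re-serialize:

```
if (opt.lightmode)
    weight_data.release();
```
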
75 changes: 72 additions & 3 deletions docs/developer-guide/operators.md
@@ -30,8 +30,10 @@
* [Dropout](#dropout)
* [Eltwise](#eltwise)
* [ELU](#elu)
* [Embed](#embed)
* [Exp](#exp)
* [Flatten](#flatten)
* [Fold](#fold)
* [GELU](#gelu)
* [GLU](#glu)
* [Gemm](#gemm)
@@ -84,6 +86,7 @@
* [Threshold](#threshold)
* [Tile](#tile)
* [UnaryOp](#unaryop)
* [Unfold](#unfold)

# AbsVal
```
@@ -474,12 +477,15 @@ y = crop(x)
| --------- | ------------- | ----- | --------- | ----------------- |
| 0 | woffset | int | 0 | |
| 1 | hoffset | int | 0 | |
| 2 | coffset | int | 1 | |
| 3 | outw | int | 1 | |
| 13 | doffset | int | 0 | |
| 2 | coffset | int | 0 | |
| 3 | outw | int | 0 | |
| 4 | outh | int | 0 | |
| 14 | outd | int | 0 | |
| 5 | outc | int | 0 | |
| 6 | woffset2 | int | 0 | |
| 7 | hoffset2 | int | 1 | |
| 7 | hoffset2 | int | 0 | |
| 15 | doffset2 | int | 0 | |
| 8 | coffset2 | int | 0 | |
| 9 | starts | array | [ ] | |
| 10 | ends | array | [ ] | |
@@ -819,6 +825,23 @@ else y = x
| --------- | ------------- | ----- | --------- | ----------------- |
| 0 | alpha | float | 0.1f | |

# Embed
```
y = embedding(x)
```

| param id | name | type | default | description |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0 | num_output | int | 0 | |
| 1 | input_dim | int | 0 | |
| 2 | bias_term | int | 0 | |
| 3 | weight_data_size | int | 0 | |

| weight | type | shape |
| ------------- | ----- | --------------------- |
| weight_data | float | [weight_data_size] |
| bias_data | float | [num_output] |
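
The lookup itself is simple: each input token index selects one `num_output`-wide row of `weight_data`, plus the optional bias. A minimal sketch (plain C++ with illustrative names, not the actual ncnn layer):

```
#include <vector>

std::vector<float> embed_forward(const std::vector<int>& x,             // token indices
                                 const std::vector<float>& weight_data, // [input_dim * num_output]
                                 const std::vector<float>& bias_data,   // [num_output] or empty
                                 int num_output)
{
    std::vector<float> y(x.size() * num_output);
    for (size_t i = 0; i < x.size(); i++)
    {
        for (int j = 0; j < num_output; j++)
        {
            // gather row x[i], add bias when bias_term is enabled
            const float b = bias_data.empty() ? 0.f : bias_data[j];
            y[i * num_output + j] = weight_data[(size_t)x[i] * num_output + j] + b;
        }
    }
    return y;
}
```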

# Exp
```
if base == -1 y = exp(shift + x * scale)
@@ -839,6 +862,29 @@ Reshape blob to 1 dimension

* one_blob_only

# Fold
```
y = fold(x)
```

* one_blob_only

| param id | name | type | default | description |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0 | num_output | int | 0 | |
| 1 | kernel_w | int | 0 | |
| 2 | dilation_w | int | 1 | |
| 3 | stride_w | int | 1 | |
| 4 | pad_left | int | 0 | |
| 11 | kernel_h | int | kernel_w | |
| 12 | dilation_h | int | dilation_w | |
| 13 | stride_h | int | stride_w | |
| 14 | pad_top | int | pad_left | |
| 15 | pad_right | int | pad_left | |
| 16 | pad_bottom | int | pad_top | |
| 20 | output_w | int | 0 | |
| 21 | output_h | int | output_w | |
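
A minimal single-channel sketch of the col2im operation these parameters describe, assuming the usual fold semantics where overlapping patch elements are summed (padding and dilation left at their defaults; illustrative, not the ncnn implementation):

```
#include <vector>

// cols holds kernel_h*kernel_w rows of L = blocks_h*blocks_w values each,
// row-major; each column is one sliding-window position.
std::vector<float> fold(const std::vector<float>& cols,
                        int output_h, int output_w,
                        int kernel_h, int kernel_w,
                        int stride_h, int stride_w)
{
    const int blocks_h = (output_h - kernel_h) / stride_h + 1;
    const int blocks_w = (output_w - kernel_w) / stride_w + 1;
    const int L = blocks_h * blocks_w;
    std::vector<float> y((size_t)output_h * output_w, 0.f);
    for (int kh = 0; kh < kernel_h; kh++)
        for (int kw = 0; kw < kernel_w; kw++)
            for (int b = 0; b < L; b++)
            {
                const int oh = (b / blocks_w) * stride_h + kh;
                const int ow = (b % blocks_w) * stride_w + kw;
                // overlapping patches accumulate into the output
                y[(size_t)oh * output_w + ow] += cols[(size_t)(kh * kernel_w + kw) * L + b];
            }
    return y;
}
```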

# GELU
```
if fast_gelu == 1 y = 0.5 * x * (1 + tanh(0.79788452 * (x + 0.044715 * x * x * x)));
@@ -1187,6 +1233,7 @@ y = data
| 1 | h | int | 0 | |
| 11 | d | int | 0 | |
| 2 | c | int | 0 | |
| 21 | load_type | int | 1 | 1=fp32 |

| weight | type | shape |
| ------------- | ----- | --------------------- |
@@ -1537,6 +1584,7 @@ y = reduce_op(x * coeff)
| 2 | coeff | float | 1.f | |
| 3 | axes | array | [ ] | |
| 4 | keepdims | int | 0 | |
| 5 | fixbug0 | int | 0 | hack for bug fix, should be 1 |

Operation type:
- 0 = SUM
@@ -1829,3 +1877,24 @@
- 17 = LOG10
- 18 = ROUND
- 19 = TRUNC

# Unfold
```
y = unfold(x)
```

* one_blob_only

| param id | name | type | default | description |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0 | num_output | int | 0 | |
| 1 | kernel_w | int | 0 | |
| 2 | dilation_w | int | 1 | |
| 3 | stride_w | int | 1 | |
| 4 | pad_left | int | 0 | |
| 11 | kernel_h | int | kernel_w | |
| 12 | dilation_h | int | dilation_w | |
| 13 | stride_h | int | stride_w | |
| 14 | pad_top | int | pad_left | |
| 15 | pad_right | int | pad_left | |
| 16 | pad_bottom | int | pad_top | |
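
Unfold is the inverse extraction (im2col): each sliding window becomes one column. A minimal single-channel sketch matching the parameters above (padding and dilation omitted; illustrative, not the ncnn code):

```
#include <vector>

std::vector<float> unfold(const std::vector<float>& x, int h, int w,
                          int kernel_h, int kernel_w,
                          int stride_h, int stride_w)
{
    const int blocks_h = (h - kernel_h) / stride_h + 1;
    const int blocks_w = (w - kernel_w) / stride_w + 1;
    const int L = blocks_h * blocks_w;
    std::vector<float> cols((size_t)kernel_h * kernel_w * L);
    for (int kh = 0; kh < kernel_h; kh++)
        for (int kw = 0; kw < kernel_w; kw++)
            for (int b = 0; b < L; b++)
            {
                // copy element (kh, kw) of sliding-window position b
                const int ih = (b / blocks_w) * stride_h + kh;
                const int iw = (b % blocks_w) * stride_w + kw;
                cols[(size_t)(kh * kernel_w + kw) * L + b] = x[(size_t)ih * w + iw];
            }
    return cols;
}
```
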
8 changes: 5 additions & 3 deletions src/layer/arm/convolution1d_arm.cpp
@@ -68,7 +68,8 @@ int Convolution1D_arm::create_pipeline(const Option& opt)

convolution1d_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
@@ -233,13 +234,14 @@ int Convolution1D_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector
}

#if NCNN_BF16
int Convolution1D_arm::create_pipeline_bf16s(const Option& /*opt*/)
int Convolution1D_arm::create_pipeline_bf16s(const Option& opt)
{
const int num_input = weight_data_size / kernel_w / num_output;

convolution1d_transform_kernel_packed_bf16s(weight_data, weight_data_tm, num_input, num_output, kernel_w);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
3 changes: 2 additions & 1 deletion src/layer/arm/convolution1d_arm_asimdhp.cpp
@@ -36,7 +36,8 @@ int Convolution1D_arm::create_pipeline_fp16s(const Option& opt)

ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
24 changes: 16 additions & 8 deletions src/layer/arm/convolution_arm.cpp
@@ -194,7 +194,8 @@ int Convolution_arm::create_pipeline(const Option& opt)

convolution_dilation1->create_pipeline(opt);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
@@ -224,7 +225,8 @@ int Convolution_arm::create_pipeline(const Option& opt)
else
conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
@@ -270,7 +272,8 @@ int Convolution_arm::create_pipeline(const Option& opt)
{
convolution_im2col_gemm_transform_kernel(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
@@ -305,7 +308,8 @@ int Convolution_arm::create_pipeline(const Option& opt)
convolution_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
@@ -904,7 +908,8 @@ int Convolution_arm::create_pipeline_bf16s(const Option& opt)
else
conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
@@ -950,7 +955,8 @@ int Convolution_arm::create_pipeline_bf16s(const Option& opt)
{
convolution_im2col_gemm_transform_kernel_bf16s(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
@@ -971,7 +977,8 @@ int Convolution_arm::create_pipeline_bf16s(const Option& opt)
convolution_transform_kernel_packed_bf16s(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
@@ -1284,7 +1291,8 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
scale_in_data[p] = scale_in;
}

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
9 changes: 6 additions & 3 deletions src/layer/arm/convolution_arm_asimdhp.cpp
@@ -108,7 +108,8 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
else
conv3x3s1_winograd23_transform_kernel_fp16sa(weight_data, weight_winograd23_data, num_input, num_output, opt);

weight_data.release();
if (opt.lightmode)
weight_data.release();

if (opt.use_fp16_arithmetic)
{
@@ -189,7 +190,8 @@

ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
@@ -219,7 +221,8 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
}

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
15 changes: 10 additions & 5 deletions src/layer/arm/convolutiondepthwise_arm.cpp
@@ -119,7 +119,8 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
ncnn::cast_float32_to_bfloat16(weight_data, weight_data_tm, opt);
}

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
@@ -161,15 +162,17 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
}
}

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}

// group convolution
create_group_ops(opt);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
@@ -1022,15 +1025,17 @@ int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt)
weight_data_tm = weight_data;
}

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}

// group convolution
create_group_ops(opt);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
6 changes: 4 additions & 2 deletions src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
@@ -76,15 +76,17 @@ int ConvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)

ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}

// group convolution
create_group_ops(opt);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
6 changes: 4 additions & 2 deletions src/layer/arm/deconvolution_arm.cpp
@@ -211,7 +211,8 @@ int Deconvolution_arm::create_pipeline(const Option& opt)
}
}

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
@@ -954,7 +955,8 @@ int Deconvolution_arm::create_pipeline_bf16s(const Option& opt)
}
}

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
3 changes: 2 additions & 1 deletion src/layer/arm/deconvolution_arm_asimdhp.cpp
@@ -154,7 +154,8 @@ int Deconvolution_arm::create_pipeline_fp16s(const Option& opt)

ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
6 changes: 4 additions & 2 deletions src/layer/arm/deconvolutiondepthwise_arm.cpp
@@ -104,7 +104,8 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
ncnn::cast_float32_to_bfloat16(weight_data_transposed, weight_data_tm, opt);
}

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}
@@ -190,7 +191,8 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
}
}

weight_data.release();
if (opt.lightmode)
weight_data.release();

return 0;
}