From cb674ac5eddb32f0709a60c81f71d2cbc6bc89da Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 21 Aug 2023 10:48:45 +0800 Subject: [PATCH] fix build with toolchain defined _L _U constants (#4957) --- src/layer/arm/gru_arm.cpp | 160 +++++++++--------- src/layer/arm/gru_arm_asimdhp.cpp | 104 ++++++------ src/layer/arm/interp_bicubic_pack4.h | 10 +- src/layer/arm/interp_bicubic_pack4_bf16s.h | 10 +- src/layer/arm/interp_bicubic_pack4_fp16s.h | 20 +-- src/layer/arm/interp_bicubic_pack8_fp16s.h | 10 +- src/layer/arm/interp_bilinear.h | 12 +- src/layer/arm/interp_bilinear_bf16s.h | 12 +- src/layer/arm/interp_bilinear_fp16s.h | 12 +- src/layer/arm/interp_bilinear_pack4.h | 6 +- src/layer/arm/interp_bilinear_pack4_bf16s.h | 6 +- src/layer/arm/interp_bilinear_pack4_fp16s.h | 12 +- src/layer/arm/interp_bilinear_pack8_fp16s.h | 6 +- src/layer/arm/lstm_arm.cpp | 36 ++-- src/layer/arm/lstm_arm_asimdhp.cpp | 36 ++-- src/layer/arm/rnn_arm.cpp | 56 +++--- src/layer/arm/rnn_arm_asimdhp.cpp | 64 +++---- src/layer/loongarch/interp_bicubic_pack4.h | 10 +- src/layer/loongarch/interp_bilinear.h | 12 +- src/layer/loongarch/interp_bilinear_pack4.h | 6 +- src/layer/mips/interp_bicubic_pack4.h | 10 +- src/layer/mips/interp_bilinear.h | 12 +- src/layer/mips/interp_bilinear_pack4.h | 6 +- src/layer/riscv/interp_bicubic_packn.h | 4 +- src/layer/riscv/interp_bicubic_packn_fp16s.h | 8 +- src/layer/riscv/interp_bilinear.h | 4 +- src/layer/riscv/interp_bilinear_fp16s.h | 8 +- src/layer/riscv/interp_bilinear_packn.h | 4 +- src/layer/riscv/interp_bilinear_packn_fp16s.h | 8 +- src/layer/x86/interp_bicubic.h | 20 +-- src/layer/x86/interp_bicubic_pack16.h | 10 +- src/layer/x86/interp_bicubic_pack4.h | 10 +- src/layer/x86/interp_bicubic_pack8.h | 10 +- src/layer/x86/interp_bilinear.h | 12 +- src/layer/x86/interp_bilinear_pack16.h | 6 +- src/layer/x86/interp_bilinear_pack4.h | 6 +- src/layer/x86/interp_bilinear_pack8.h | 6 +- src/layer/x86/lstm_x86.cpp | 18 +- src/mat_pixel_resize.cpp | 16 +- tools/pnnx/tests/test_torch_max.py | 12 +- tools/pnnx/tests/test_torch_min.py | 12 +- 41 files changed, 403 insertions(+), 399 deletions(-) diff --git a/src/layer/arm/gru_arm.cpp b/src/layer/arm/gru_arm.cpp index b0227bf1f53..aa927d26a58 100644 --- a/src/layer/arm/gru_arm.cpp +++ b/src/layer/arm/gru_arm.cpp @@ -294,8 +294,8 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* weight_xc_RUN = weight_xc.row(q / 4); const float* weight_hc_RUN = weight_hc.row(q / 4); - float32x4_t _R = vld1q_f32(bias_c_RUBNWN); - float32x4_t _U = vld1q_f32(bias_c_RUBNWN + 4); + float32x4_t _gru_R = vld1q_f32(bias_c_RUBNWN); + float32x4_t _gru_U = vld1q_f32(bias_c_RUBNWN + 4); float32x4_t _sum1 = vdupq_n_f32(0.f); float32x4_t _sum2 = vdupq_n_f32(0.f); float32x4_t _sum3 = vdupq_n_f32(0.f); @@ -316,8 +316,8 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we float32x4_t _weight_xc_R_3 = vld1q_f32(weight_xc_RUN + 24); float32x4_t _weight_xc_U_3 = vld1q_f32(weight_xc_RUN + 28); #if __aarch64__ - _R = vfmaq_laneq_f32(_R, _weight_xc_R, _xi, 0); - _U = vfmaq_laneq_f32(_U, _weight_xc_U, _xi, 0); + _gru_R = vfmaq_laneq_f32(_gru_R, _weight_xc_R, _xi, 0); + _gru_U = vfmaq_laneq_f32(_gru_U, _weight_xc_U, _xi, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_R_1, _xi, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_U_1, _xi, 1); _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_R_2, _xi, 2); @@ -325,8 +325,8 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we _sum5 = 
vfmaq_laneq_f32(_sum5, _weight_xc_R_3, _xi, 3); _sum6 = vfmaq_laneq_f32(_sum6, _weight_xc_U_3, _xi, 3); #else - _R = vmlaq_lane_f32(_R, _weight_xc_R, vget_low_f32(_xi), 0); - _U = vmlaq_lane_f32(_U, _weight_xc_U, vget_low_f32(_xi), 0); + _gru_R = vmlaq_lane_f32(_gru_R, _weight_xc_R, vget_low_f32(_xi), 0); + _gru_U = vmlaq_lane_f32(_gru_U, _weight_xc_U, vget_low_f32(_xi), 0); _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_R_1, vget_low_f32(_xi), 1); _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_U_1, vget_low_f32(_xi), 1); _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_R_2, vget_high_f32(_xi), 0); @@ -344,8 +344,8 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we float32x4_t _xi = vdupq_n_f32(xi); float32x4_t _weight_xc_R = vld1q_f32(weight_xc_RUN); float32x4_t _weight_xc_U = vld1q_f32(weight_xc_RUN + 4); - _R = vmlaq_f32(_R, _weight_xc_R, _xi); - _U = vmlaq_f32(_U, _weight_xc_U, _xi); + _gru_R = vmlaq_f32(_gru_R, _weight_xc_R, _xi); + _gru_U = vmlaq_f32(_gru_U, _weight_xc_U, _xi); weight_xc_RUN += 8; } @@ -363,8 +363,8 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we float32x4_t _weight_hc_R_3 = vld1q_f32(weight_hc_RUN + 24); float32x4_t _weight_hc_U_3 = vld1q_f32(weight_hc_RUN + 28); #if __aarch64__ - _R = vfmaq_laneq_f32(_R, _weight_hc_R, _h_cont, 0); - _U = vfmaq_laneq_f32(_U, _weight_hc_U, _h_cont, 0); + _gru_R = vfmaq_laneq_f32(_gru_R, _weight_hc_R, _h_cont, 0); + _gru_U = vfmaq_laneq_f32(_gru_U, _weight_hc_U, _h_cont, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_R_1, _h_cont, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_U_1, _h_cont, 1); _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_R_2, _h_cont, 2); @@ -372,8 +372,8 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we _sum5 = vfmaq_laneq_f32(_sum5, _weight_hc_R_3, _h_cont, 3); _sum6 = vfmaq_laneq_f32(_sum6, _weight_hc_U_3, _h_cont, 3); #else - _R = vmlaq_lane_f32(_R, _weight_hc_R, vget_low_f32(_h_cont), 0); - _U = vmlaq_lane_f32(_U, _weight_hc_U, vget_low_f32(_h_cont), 0); + _gru_R = vmlaq_lane_f32(_gru_R, _weight_hc_R, vget_low_f32(_h_cont), 0); + _gru_U = vmlaq_lane_f32(_gru_U, _weight_hc_U, vget_low_f32(_h_cont), 0); _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_R_1, vget_low_f32(_h_cont), 1); _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_U_1, vget_low_f32(_h_cont), 1); _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_R_2, vget_high_f32(_h_cont), 0); @@ -391,26 +391,26 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we float32x4_t _h_cont = vdupq_n_f32(h_cont); float32x4_t _weight_hc_R = vld1q_f32(weight_hc_RUN); float32x4_t _weight_hc_U = vld1q_f32(weight_hc_RUN + 4); - _R = vmlaq_f32(_R, _weight_hc_R, _h_cont); - _U = vmlaq_f32(_U, _weight_hc_U, _h_cont); + _gru_R = vmlaq_f32(_gru_R, _weight_hc_R, _h_cont); + _gru_U = vmlaq_f32(_gru_U, _weight_hc_U, _h_cont); weight_hc_RUN += 8; } - _R = vaddq_f32(_R, _sum1); - _U = vaddq_f32(_U, _sum2); + _gru_R = vaddq_f32(_gru_R, _sum1); + _gru_U = vaddq_f32(_gru_U, _sum2); _sum3 = vaddq_f32(_sum3, _sum5); _sum4 = vaddq_f32(_sum4, _sum6); - _R = vaddq_f32(_R, _sum3); - _U = vaddq_f32(_U, _sum4); + _gru_R = vaddq_f32(_gru_R, _sum3); + _gru_U = vaddq_f32(_gru_U, _sum4); // sigmoid(R) // sigmoid(U) - _R = sigmoid_ps(_R); - _U = sigmoid_ps(_U); + _gru_R = sigmoid_ps(_gru_R); + _gru_U = sigmoid_ps(_gru_U); // gate new - float32x4_t _N = vld1q_f32(bias_c_RUBNWN + 8); + float32x4_t _gru_N = vld1q_f32(bias_c_RUBNWN + 8); _sum1 = vdupq_n_f32(0.f); _sum2 = vdupq_n_f32(0.f); _sum3 = vdupq_n_f32(0.f); @@ 
-424,12 +424,12 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we float32x4_t _weight_hc_N_2 = vld1q_f32(weight_hc_RUN + 8); float32x4_t _weight_hc_N_3 = vld1q_f32(weight_hc_RUN + 12); #if __aarch64__ - _N = vfmaq_laneq_f32(_N, _weight_hc_N, _h_cont, 0); + _gru_N = vfmaq_laneq_f32(_gru_N, _weight_hc_N, _h_cont, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_N_1, _h_cont, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_N_2, _h_cont, 2); _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_N_3, _h_cont, 3); #else - _N = vmlaq_lane_f32(_N, _weight_hc_N, vget_low_f32(_h_cont), 0); + _gru_N = vmlaq_lane_f32(_gru_N, _weight_hc_N, vget_low_f32(_h_cont), 0); _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_N_1, vget_low_f32(_h_cont), 1); _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_N_2, vget_high_f32(_h_cont), 0); _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_N_3, vget_high_f32(_h_cont), 1); @@ -443,16 +443,16 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we float32x4_t _h_cont = vdupq_n_f32(h_cont); float32x4_t _weight_hc_N = vld1q_f32(weight_hc_RUN); - _N = vmlaq_f32(_N, _weight_hc_N, _h_cont); + _gru_N = vmlaq_f32(_gru_N, _weight_hc_N, _h_cont); weight_hc_RUN += 4; } - _N = vaddq_f32(_N, _sum1); + _gru_N = vaddq_f32(_gru_N, _sum1); _sum2 = vaddq_f32(_sum2, _sum3); - _N = vaddq_f32(_N, _sum2); + _gru_N = vaddq_f32(_gru_N, _sum2); - _N = vmlaq_f32(vld1q_f32(bias_c_RUBNWN + 12), _R, _N); + _gru_N = vmlaq_f32(vld1q_f32(bias_c_RUBNWN + 12), _gru_R, _gru_N); _sum1 = vdupq_n_f32(0.f); _sum2 = vdupq_n_f32(0.f); _sum3 = vdupq_n_f32(0.f); @@ -466,12 +466,12 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we float32x4_t _weight_xc_N_2 = vld1q_f32(weight_xc_RUN + 8); float32x4_t _weight_xc_N_3 = vld1q_f32(weight_xc_RUN + 12); #if __aarch64__ - _N = vfmaq_laneq_f32(_N, _weight_xc_N, _xi, 0); + _gru_N = vfmaq_laneq_f32(_gru_N, _weight_xc_N, _xi, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_N_1, _xi, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_N_2, _xi, 2); _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_N_3, _xi, 3); #else - _N = vmlaq_lane_f32(_N, _weight_xc_N, vget_low_f32(_xi), 0); + _gru_N = vmlaq_lane_f32(_gru_N, _weight_xc_N, vget_low_f32(_xi), 0); _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_N_1, vget_low_f32(_xi), 1); _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_N_2, vget_high_f32(_xi), 0); _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_N_3, vget_high_f32(_xi), 1); @@ -485,22 +485,22 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we float32x4_t _xi = vdupq_n_f32(xi); float32x4_t _weight_xc_N = vld1q_f32(weight_xc_RUN); - _N = vmlaq_f32(_N, _weight_xc_N, _xi); + _gru_N = vmlaq_f32(_gru_N, _weight_xc_N, _xi); weight_xc_RUN += 4; } - _N = vaddq_f32(_N, _sum1); + _gru_N = vaddq_f32(_gru_N, _sum1); _sum2 = vaddq_f32(_sum2, _sum3); - _N = vaddq_f32(_N, _sum2); + _gru_N = vaddq_f32(_gru_N, _sum2); // tanh(N) - _N = tanh_ps(_N); + _gru_N = tanh_ps(_gru_N); float* gates_data = gates.row(q / 4); - vst1q_f32(gates_data, _U); - vst1q_f32(gates_data + 4, _N); + vst1q_f32(gates_data, _gru_U); + vst1q_f32(gates_data + 4, _gru_N); } #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) @@ -599,13 +599,13 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* gates_data = gates.row(q / 4); - float32x4_t _U = vld1q_f32(gates_data); - float32x4_t _N = vld1q_f32(gates_data + 4); + float32x4_t _gru_U = vld1q_f32(gates_data); + float32x4_t _gru_N = vld1q_f32(gates_data + 
4); - float32x4_t _H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _U), _N), vmulq_f32(_U, vld1q_f32(hidden_ptr + q))); + float32x4_t _gru_H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _gru_U), _gru_N), vmulq_f32(_gru_U, vld1q_f32(hidden_ptr + q))); - vst1q_f32(hidden_ptr + q, _H); - vst1q_f32(output_data + q, _H); + vst1q_f32(hidden_ptr + q, _gru_H); + vst1q_f32(output_data + q, _gru_H); } #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) @@ -836,8 +836,8 @@ static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const unsigned short* weight_xc_RUN = weight_xc.row(q / 4); const unsigned short* weight_hc_RUN = weight_hc.row(q / 4); - float32x4_t _R = bfloat2float(vld1_u16(bias_c_RUBNWN)); - float32x4_t _U = bfloat2float(vld1_u16(bias_c_RUBNWN + 4)); + float32x4_t _gru_R = bfloat2float(vld1_u16(bias_c_RUBNWN)); + float32x4_t _gru_U = bfloat2float(vld1_u16(bias_c_RUBNWN + 4)); float32x4_t _sum1 = vdupq_n_f32(0.f); float32x4_t _sum2 = vdupq_n_f32(0.f); float32x4_t _sum3 = vdupq_n_f32(0.f); @@ -858,8 +858,8 @@ static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _weight_xc_R_3 = bfloat2float(vld1_u16(weight_xc_RUN + 24)); float32x4_t _weight_xc_U_3 = bfloat2float(vld1_u16(weight_xc_RUN + 28)); #if __aarch64__ - _R = vfmaq_laneq_f32(_R, _weight_xc_R, _xi, 0); - _U = vfmaq_laneq_f32(_U, _weight_xc_U, _xi, 0); + _gru_R = vfmaq_laneq_f32(_gru_R, _weight_xc_R, _xi, 0); + _gru_U = vfmaq_laneq_f32(_gru_U, _weight_xc_U, _xi, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_R_1, _xi, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_U_1, _xi, 1); _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_R_2, _xi, 2); @@ -867,8 +867,8 @@ static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M _sum5 = vfmaq_laneq_f32(_sum5, _weight_xc_R_3, _xi, 3); _sum6 = vfmaq_laneq_f32(_sum6, _weight_xc_U_3, _xi, 3); #else - _R = vmlaq_lane_f32(_R, _weight_xc_R, vget_low_f32(_xi), 0); - _U = vmlaq_lane_f32(_U, _weight_xc_U, vget_low_f32(_xi), 0); + _gru_R = vmlaq_lane_f32(_gru_R, _weight_xc_R, vget_low_f32(_xi), 0); + _gru_U = vmlaq_lane_f32(_gru_U, _weight_xc_U, vget_low_f32(_xi), 0); _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_R_1, vget_low_f32(_xi), 1); _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_U_1, vget_low_f32(_xi), 1); _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_R_2, vget_high_f32(_xi), 0); @@ -886,8 +886,8 @@ static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _xi = bfloat2float(vdup_n_u16(xi)); float32x4_t _weight_xc_R = bfloat2float(vld1_u16(weight_xc_RUN)); float32x4_t _weight_xc_U = bfloat2float(vld1_u16(weight_xc_RUN + 4)); - _R = vmlaq_f32(_R, _weight_xc_R, _xi); - _U = vmlaq_f32(_U, _weight_xc_U, _xi); + _gru_R = vmlaq_f32(_gru_R, _weight_xc_R, _xi); + _gru_U = vmlaq_f32(_gru_U, _weight_xc_U, _xi); weight_xc_RUN += 8; } @@ -905,8 +905,8 @@ static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _weight_hc_R_3 = bfloat2float(vld1_u16(weight_hc_RUN + 24)); float32x4_t _weight_hc_U_3 = bfloat2float(vld1_u16(weight_hc_RUN + 28)); #if __aarch64__ - _R = vfmaq_laneq_f32(_R, _weight_hc_R, _h_cont, 0); - _U = vfmaq_laneq_f32(_U, _weight_hc_U, _h_cont, 0); + _gru_R = vfmaq_laneq_f32(_gru_R, _weight_hc_R, _h_cont, 0); + _gru_U = vfmaq_laneq_f32(_gru_U, _weight_hc_U, _h_cont, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_R_1, _h_cont, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_U_1, _h_cont, 1); _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_R_2, 
_h_cont, 2); @@ -914,8 +914,8 @@ static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M _sum5 = vfmaq_laneq_f32(_sum5, _weight_hc_R_3, _h_cont, 3); _sum6 = vfmaq_laneq_f32(_sum6, _weight_hc_U_3, _h_cont, 3); #else - _R = vmlaq_lane_f32(_R, _weight_hc_R, vget_low_f32(_h_cont), 0); - _U = vmlaq_lane_f32(_U, _weight_hc_U, vget_low_f32(_h_cont), 0); + _gru_R = vmlaq_lane_f32(_gru_R, _weight_hc_R, vget_low_f32(_h_cont), 0); + _gru_U = vmlaq_lane_f32(_gru_U, _weight_hc_U, vget_low_f32(_h_cont), 0); _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_R_1, vget_low_f32(_h_cont), 1); _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_U_1, vget_low_f32(_h_cont), 1); _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_R_2, vget_high_f32(_h_cont), 0); @@ -933,26 +933,26 @@ static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _h_cont = vdupq_n_f32(h_cont); float32x4_t _weight_hc_R = bfloat2float(vld1_u16(weight_hc_RUN)); float32x4_t _weight_hc_U = bfloat2float(vld1_u16(weight_hc_RUN + 4)); - _R = vmlaq_f32(_R, _weight_hc_R, _h_cont); - _U = vmlaq_f32(_U, _weight_hc_U, _h_cont); + _gru_R = vmlaq_f32(_gru_R, _weight_hc_R, _h_cont); + _gru_U = vmlaq_f32(_gru_U, _weight_hc_U, _h_cont); weight_hc_RUN += 8; } - _R = vaddq_f32(_R, _sum1); - _U = vaddq_f32(_U, _sum2); + _gru_R = vaddq_f32(_gru_R, _sum1); + _gru_U = vaddq_f32(_gru_U, _sum2); _sum3 = vaddq_f32(_sum3, _sum5); _sum4 = vaddq_f32(_sum4, _sum6); - _R = vaddq_f32(_R, _sum3); - _U = vaddq_f32(_U, _sum4); + _gru_R = vaddq_f32(_gru_R, _sum3); + _gru_U = vaddq_f32(_gru_U, _sum4); // sigmoid(R) // sigmoid(U) - _R = sigmoid_ps(_R); - _U = sigmoid_ps(_U); + _gru_R = sigmoid_ps(_gru_R); + _gru_U = sigmoid_ps(_gru_U); // gate new - float32x4_t _N = bfloat2float(vld1_u16(bias_c_RUBNWN + 8)); + float32x4_t _gru_N = bfloat2float(vld1_u16(bias_c_RUBNWN + 8)); _sum1 = vdupq_n_f32(0.f); _sum2 = vdupq_n_f32(0.f); _sum3 = vdupq_n_f32(0.f); @@ -966,12 +966,12 @@ static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _weight_hc_N_2 = bfloat2float(vld1_u16(weight_hc_RUN + 8)); float32x4_t _weight_hc_N_3 = bfloat2float(vld1_u16(weight_hc_RUN + 12)); #if __aarch64__ - _N = vfmaq_laneq_f32(_N, _weight_hc_N, _h_cont, 0); + _gru_N = vfmaq_laneq_f32(_gru_N, _weight_hc_N, _h_cont, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_N_1, _h_cont, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_N_2, _h_cont, 2); _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_N_3, _h_cont, 3); #else - _N = vmlaq_lane_f32(_N, _weight_hc_N, vget_low_f32(_h_cont), 0); + _gru_N = vmlaq_lane_f32(_gru_N, _weight_hc_N, vget_low_f32(_h_cont), 0); _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_N_1, vget_low_f32(_h_cont), 1); _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_N_2, vget_high_f32(_h_cont), 0); _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_N_3, vget_high_f32(_h_cont), 1); @@ -985,16 +985,16 @@ static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _h_cont = vdupq_n_f32(h_cont); float32x4_t _weight_hc_N = bfloat2float(vld1_u16(weight_hc_RUN)); - _N = vmlaq_f32(_N, _weight_hc_N, _h_cont); + _gru_N = vmlaq_f32(_gru_N, _weight_hc_N, _h_cont); weight_hc_RUN += 4; } - _N = vaddq_f32(_N, _sum1); + _gru_N = vaddq_f32(_gru_N, _sum1); _sum2 = vaddq_f32(_sum2, _sum3); - _N = vaddq_f32(_N, _sum2); + _gru_N = vaddq_f32(_gru_N, _sum2); - _N = vmlaq_f32(bfloat2float(vld1_u16(bias_c_RUBNWN + 12)), _R, _N); + _gru_N = vmlaq_f32(bfloat2float(vld1_u16(bias_c_RUBNWN + 12)), _gru_R, _gru_N); _sum1 = vdupq_n_f32(0.f); _sum2 = 
vdupq_n_f32(0.f); _sum3 = vdupq_n_f32(0.f); @@ -1008,12 +1008,12 @@ static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _weight_xc_N_2 = bfloat2float(vld1_u16(weight_xc_RUN + 8)); float32x4_t _weight_xc_N_3 = bfloat2float(vld1_u16(weight_xc_RUN + 12)); #if __aarch64__ - _N = vfmaq_laneq_f32(_N, _weight_xc_N, _xi, 0); + _gru_N = vfmaq_laneq_f32(_gru_N, _weight_xc_N, _xi, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_N_1, _xi, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_N_2, _xi, 2); _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_N_3, _xi, 3); #else - _N = vmlaq_lane_f32(_N, _weight_xc_N, vget_low_f32(_xi), 0); + _gru_N = vmlaq_lane_f32(_gru_N, _weight_xc_N, vget_low_f32(_xi), 0); _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_N_1, vget_low_f32(_xi), 1); _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_N_2, vget_high_f32(_xi), 0); _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_N_3, vget_high_f32(_xi), 1); @@ -1027,22 +1027,22 @@ static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _xi = bfloat2float(vdup_n_u16(xi)); float32x4_t _weight_xc_N = bfloat2float(vld1_u16(weight_xc_RUN)); - _N = vmlaq_f32(_N, _weight_xc_N, _xi); + _gru_N = vmlaq_f32(_gru_N, _weight_xc_N, _xi); weight_xc_RUN += 4; } - _N = vaddq_f32(_N, _sum1); + _gru_N = vaddq_f32(_gru_N, _sum1); _sum2 = vaddq_f32(_sum2, _sum3); - _N = vaddq_f32(_N, _sum2); + _gru_N = vaddq_f32(_gru_N, _sum2); // tanh(N) - _N = tanh_ps(_N); + _gru_N = tanh_ps(_gru_N); float* gates_data = gates.row(q / 4); - vst1q_f32(gates_data, _U); - vst1q_f32(gates_data + 4, _N); + vst1q_f32(gates_data, _gru_U); + vst1q_f32(gates_data + 4, _gru_N); } #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) @@ -1141,13 +1141,13 @@ static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* gates_data = gates.row(q / 4); - float32x4_t _U = vld1q_f32(gates_data); - float32x4_t _N = vld1q_f32(gates_data + 4); + float32x4_t _gru_U = vld1q_f32(gates_data); + float32x4_t _gru_N = vld1q_f32(gates_data + 4); - float32x4_t _H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _U), _N), vmulq_f32(_U, vld1q_f32(hidden_ptr + q))); + float32x4_t _gru_H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _gru_U), _gru_N), vmulq_f32(_gru_U, vld1q_f32(hidden_ptr + q))); - vst1q_f32(hidden_ptr + q, _H); - vst1_u16(output_data + q, float2bfloat(_H)); + vst1q_f32(hidden_ptr + q, _gru_H); + vst1_u16(output_data + q, float2bfloat(_gru_H)); } #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) diff --git a/src/layer/arm/gru_arm_asimdhp.cpp b/src/layer/arm/gru_arm_asimdhp.cpp index 33ae1355ff8..f5e74b50284 100644 --- a/src/layer/arm/gru_arm_asimdhp.cpp +++ b/src/layer/arm/gru_arm_asimdhp.cpp @@ -57,8 +57,8 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const __fp16* weight_xc_RUN = weight_xc.row(q / 4); const __fp16* weight_hc_RUN = weight_hc.row(q / 4); - float32x4_t _R = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN)); - float32x4_t _U = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 4)); + float32x4_t _gru_R = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN)); + float32x4_t _gru_U = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 4)); float32x4_t _sum1 = vdupq_n_f32(0.f); float32x4_t _sum2 = vdupq_n_f32(0.f); float32x4_t _sum3 = vdupq_n_f32(0.f); @@ -78,8 +78,8 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _weight_xc_U_2 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 20)); float32x4_t _weight_xc_R_3 = 
vcvt_f32_f16(vld1_f16(weight_xc_RUN + 24)); float32x4_t _weight_xc_U_3 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 28)); - _R = vfmaq_laneq_f32(_R, _weight_xc_R, _xi, 0); - _U = vfmaq_laneq_f32(_U, _weight_xc_U, _xi, 0); + _gru_R = vfmaq_laneq_f32(_gru_R, _weight_xc_R, _xi, 0); + _gru_U = vfmaq_laneq_f32(_gru_U, _weight_xc_U, _xi, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_R_1, _xi, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_U_1, _xi, 1); _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_R_2, _xi, 2); @@ -96,8 +96,8 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _xi = vcvt_f32_f16(vdup_n_f16(xi)); float32x4_t _weight_xc_R = vcvt_f32_f16(vld1_f16(weight_xc_RUN)); float32x4_t _weight_xc_U = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 4)); - _R = vmlaq_f32(_R, _weight_xc_R, _xi); - _U = vmlaq_f32(_U, _weight_xc_U, _xi); + _gru_R = vmlaq_f32(_gru_R, _weight_xc_R, _xi); + _gru_U = vmlaq_f32(_gru_U, _weight_xc_U, _xi); weight_xc_RUN += 8; } @@ -114,8 +114,8 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _weight_hc_U_2 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 20)); float32x4_t _weight_hc_R_3 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 24)); float32x4_t _weight_hc_U_3 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 28)); - _R = vfmaq_laneq_f32(_R, _weight_hc_R, _h_cont, 0); - _U = vfmaq_laneq_f32(_U, _weight_hc_U, _h_cont, 0); + _gru_R = vfmaq_laneq_f32(_gru_R, _weight_hc_R, _h_cont, 0); + _gru_U = vfmaq_laneq_f32(_gru_U, _weight_hc_U, _h_cont, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_R_1, _h_cont, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_U_1, _h_cont, 1); _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_R_2, _h_cont, 2); @@ -132,26 +132,26 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _h_cont = vdupq_n_f32(h_cont); float32x4_t _weight_hc_R = vcvt_f32_f16(vld1_f16(weight_hc_RUN)); float32x4_t _weight_hc_U = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 4)); - _R = vmlaq_f32(_R, _weight_hc_R, _h_cont); - _U = vmlaq_f32(_U, _weight_hc_U, _h_cont); + _gru_R = vmlaq_f32(_gru_R, _weight_hc_R, _h_cont); + _gru_U = vmlaq_f32(_gru_U, _weight_hc_U, _h_cont); weight_hc_RUN += 8; } - _R = vaddq_f32(_R, _sum1); - _U = vaddq_f32(_U, _sum2); + _gru_R = vaddq_f32(_gru_R, _sum1); + _gru_U = vaddq_f32(_gru_U, _sum2); _sum3 = vaddq_f32(_sum3, _sum5); _sum4 = vaddq_f32(_sum4, _sum6); - _R = vaddq_f32(_R, _sum3); - _U = vaddq_f32(_U, _sum4); + _gru_R = vaddq_f32(_gru_R, _sum3); + _gru_U = vaddq_f32(_gru_U, _sum4); // sigmoid(R) // sigmoid(U) - _R = sigmoid_ps(_R); - _U = sigmoid_ps(_U); + _gru_R = sigmoid_ps(_gru_R); + _gru_U = sigmoid_ps(_gru_U); // gate new - float32x4_t _N = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 8)); + float32x4_t _gru_N = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 8)); _sum1 = vdupq_n_f32(0.f); _sum2 = vdupq_n_f32(0.f); _sum3 = vdupq_n_f32(0.f); @@ -164,7 +164,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _weight_hc_N_1 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 4)); float32x4_t _weight_hc_N_2 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 8)); float32x4_t _weight_hc_N_3 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 12)); - _N = vfmaq_laneq_f32(_N, _weight_hc_N, _h_cont, 0); + _gru_N = vfmaq_laneq_f32(_gru_N, _weight_hc_N, _h_cont, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_N_1, _h_cont, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_N_2, _h_cont, 2); _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_N_3, _h_cont, 3); @@ -177,16 +177,16 @@ static int 
gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _h_cont = vdupq_n_f32(h_cont); float32x4_t _weight_hc_N = vcvt_f32_f16(vld1_f16(weight_hc_RUN)); - _N = vmlaq_f32(_N, _weight_hc_N, _h_cont); + _gru_N = vmlaq_f32(_gru_N, _weight_hc_N, _h_cont); weight_hc_RUN += 4; } - _N = vaddq_f32(_N, _sum1); + _gru_N = vaddq_f32(_gru_N, _sum1); _sum2 = vaddq_f32(_sum2, _sum3); - _N = vaddq_f32(_N, _sum2); + _gru_N = vaddq_f32(_gru_N, _sum2); - _N = vmlaq_f32(vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 12)), _R, _N); + _gru_N = vmlaq_f32(vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 12)), _gru_R, _gru_N); _sum1 = vdupq_n_f32(0.f); _sum2 = vdupq_n_f32(0.f); _sum3 = vdupq_n_f32(0.f); @@ -199,7 +199,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _weight_xc_N_1 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 4)); float32x4_t _weight_xc_N_2 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 8)); float32x4_t _weight_xc_N_3 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 12)); - _N = vfmaq_laneq_f32(_N, _weight_xc_N, _xi, 0); + _gru_N = vfmaq_laneq_f32(_gru_N, _weight_xc_N, _xi, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_N_1, _xi, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_N_2, _xi, 2); _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_N_3, _xi, 3); @@ -212,22 +212,22 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _xi = vcvt_f32_f16(vdup_n_f16(xi)); float32x4_t _weight_xc_N = vcvt_f32_f16(vld1_f16(weight_xc_RUN)); - _N = vmlaq_f32(_N, _weight_xc_N, _xi); + _gru_N = vmlaq_f32(_gru_N, _weight_xc_N, _xi); weight_xc_RUN += 4; } - _N = vaddq_f32(_N, _sum1); + _gru_N = vaddq_f32(_gru_N, _sum1); _sum2 = vaddq_f32(_sum2, _sum3); - _N = vaddq_f32(_N, _sum2); + _gru_N = vaddq_f32(_gru_N, _sum2); // tanh(N) - _N = tanh_ps(_N); + _gru_N = tanh_ps(_gru_N); float* gates_data = gates.row(q / 4); - vst1q_f32(gates_data, _U); - vst1q_f32(gates_data + 4, _N); + vst1q_f32(gates_data, _gru_U); + vst1q_f32(gates_data + 4, _gru_N); } #pragma omp parallel for num_threads(opt.num_threads) for (int q = remain_num_output_start; q < num_output; q++) @@ -314,13 +314,13 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* gates_data = gates.row(q / 4); - float32x4_t _U = vld1q_f32(gates_data); - float32x4_t _N = vld1q_f32(gates_data + 4); + float32x4_t _gru_U = vld1q_f32(gates_data); + float32x4_t _gru_N = vld1q_f32(gates_data + 4); - float32x4_t _H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _U), _N), vmulq_f32(_U, vld1q_f32(hidden_ptr + q))); + float32x4_t _gru_H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _gru_U), _gru_N), vmulq_f32(_gru_U, vld1q_f32(hidden_ptr + q))); - vst1q_f32(hidden_ptr + q, _H); - vst1_f16(output_data + q, vcvt_f16_f32(_H)); + vst1q_f32(hidden_ptr + q, _gru_H); + vst1_f16(output_data + q, vcvt_f16_f32(_gru_H)); } #pragma omp parallel for num_threads(opt.num_threads) for (int q = remain_num_output_start; q < num_output; q++) @@ -463,7 +463,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const hidden_ptr = hidden_state; // gate new - float16x4_t _N = vld1_f16(bias_c_RUBNWN + 8); + float16x4_t _gru_N = vld1_f16(bias_c_RUBNWN + 8); float16x4_t _sum4 = vdup_n_f16((__fp16)0.f); float16x4_t _sum5 = vdup_n_f16((__fp16)0.f); float16x4_t _sum6 = vdup_n_f16((__fp16)0.f); @@ -481,13 +481,13 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const "fmla %5.4h, v3.4h, v4.h[3] \n" : "=r"(hidden_ptr), "=r"(weight_hc_RUN), - 
"=w"(_N), + "=w"(_gru_N), "=w"(_sum4), "=w"(_sum5), "=w"(_sum6) : "0"(hidden_ptr), "1"(weight_hc_RUN), - "2"(_N), + "2"(_gru_N), "3"(_sum4), "4"(_sum5), "5"(_sum6) @@ -499,16 +499,16 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float16x4_t _h_cont = vdup_n_f16((__fp16)h_cont); float16x4_t _weight_hc_N = vld1_f16(weight_hc_RUN); - _N = vfma_f16(_N, _weight_hc_N, _h_cont); + _gru_N = vfma_f16(_gru_N, _weight_hc_N, _h_cont); weight_hc_RUN += 4; } - _N = vadd_f16(_N, _sum4); + _gru_N = vadd_f16(_gru_N, _sum4); _sum5 = vadd_f16(_sum5, _sum6); - _N = vadd_f16(_N, _sum5); + _gru_N = vadd_f16(_gru_N, _sum5); - _N = vfma_f16(vld1_f16(bias_c_RUBNWN + 12), vcvt_f16_f32(_R32), _N); + _gru_N = vfma_f16(vld1_f16(bias_c_RUBNWN + 12), vcvt_f16_f32(_R32), _gru_N); _sum4 = vdup_n_f16((__fp16)0.f); _sum5 = vdup_n_f16((__fp16)0.f); _sum6 = vdup_n_f16((__fp16)0.f); @@ -525,13 +525,13 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const "fmla %5.4h, v3.4h, v4.h[3] \n" : "=r"(x), "=r"(weight_xc_RUN), - "=w"(_N), + "=w"(_gru_N), "=w"(_sum4), "=w"(_sum5), "=w"(_sum6) : "0"(x), "1"(weight_xc_RUN), - "2"(_N), + "2"(_gru_N), "3"(_sum4), "4"(_sum5), "5"(_sum6) @@ -543,17 +543,17 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float16x4_t _xi = vdup_n_f16(xi); float16x4_t _weight_xc_N = vld1_f16(weight_xc_RUN); - _N = vfma_f16(_N, _weight_xc_N, _xi); + _gru_N = vfma_f16(_gru_N, _weight_xc_N, _xi); weight_xc_RUN += 4; } - _N = vadd_f16(_N, _sum4); + _gru_N = vadd_f16(_gru_N, _sum4); _sum5 = vadd_f16(_sum5, _sum6); - _N = vadd_f16(_N, _sum5); + _gru_N = vadd_f16(_gru_N, _sum5); // tanh(N) - float32x4_t _N32 = tanh_ps(vcvt_f32_f16(_N)); + float32x4_t _N32 = tanh_ps(vcvt_f32_f16(_gru_N)); float* gates_data = gates.row(q / 4); @@ -645,13 +645,13 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const float* gates_data = gates.row(q / 4); - float32x4_t _U = vld1q_f32(gates_data); - float32x4_t _N = vld1q_f32(gates_data + 4); + float32x4_t _gru_U = vld1q_f32(gates_data); + float32x4_t _gru_N = vld1q_f32(gates_data + 4); - float32x4_t _H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _U), _N), vmulq_f32(_U, vld1q_f32(hidden_ptr + q))); + float32x4_t _gru_H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _gru_U), _gru_N), vmulq_f32(_gru_U, vld1q_f32(hidden_ptr + q))); - vst1q_f32(hidden_ptr + q, _H); - vst1_f16(output_data + q, vcvt_f16_f32(_H)); + vst1q_f32(hidden_ptr + q, _gru_H); + vst1_f16(output_data + q, vcvt_f16_f32(_gru_H)); } #pragma omp parallel for num_threads(opt.num_threads) for (int q = remain_num_output_start; q < num_output; q++) diff --git a/src/layer/arm/interp_bicubic_pack4.h b/src/layer/arm/interp_bicubic_pack4.h index 2a4c7780526..ab0d561253e 100644 --- a/src/layer/arm/interp_bicubic_pack4.h +++ b/src/layer/arm/interp_bicubic_pack4.h @@ -254,11 +254,11 @@ static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, i float32x4_t _rows1 = vld1q_f32(rows1p); float32x4_t _rows2 = vld1q_f32(rows2p); float32x4_t _rows3 = vld1q_f32(rows3p); - float32x4_t _D = vmulq_lane_f32(_rows0, vget_low_f32(_b0123), 0); - _D = vmlaq_lane_f32(_D, _rows1, vget_low_f32(_b0123), 1); - _D = vmlaq_lane_f32(_D, _rows2, vget_high_f32(_b0123), 0); - _D = vmlaq_lane_f32(_D, _rows3, vget_high_f32(_b0123), 1); - vst1q_f32(Dp, _D); + float32x4_t _Dp = vmulq_lane_f32(_rows0, vget_low_f32(_b0123), 0); + _Dp = vmlaq_lane_f32(_Dp, _rows1, vget_low_f32(_b0123), 1); + _Dp = 
vmlaq_lane_f32(_Dp, _rows2, vget_high_f32(_b0123), 0); + _Dp = vmlaq_lane_f32(_Dp, _rows3, vget_high_f32(_b0123), 1); + vst1q_f32(Dp, _Dp); Dp += 4; rows0p += 4; diff --git a/src/layer/arm/interp_bicubic_pack4_bf16s.h b/src/layer/arm/interp_bicubic_pack4_bf16s.h index b672f716d9e..6546accf6b5 100644 --- a/src/layer/arm/interp_bicubic_pack4_bf16s.h +++ b/src/layer/arm/interp_bicubic_pack4_bf16s.h @@ -254,11 +254,11 @@ static void resize_bicubic_image_pack4_bf16s(const Mat& src, Mat& dst, float* al float32x4_t _rows1 = vld1q_f32(rows1p); float32x4_t _rows2 = vld1q_f32(rows2p); float32x4_t _rows3 = vld1q_f32(rows3p); - float32x4_t _D = vmulq_lane_f32(_rows0, vget_low_f32(_b0123), 0); - _D = vmlaq_lane_f32(_D, _rows1, vget_low_f32(_b0123), 1); - _D = vmlaq_lane_f32(_D, _rows2, vget_high_f32(_b0123), 0); - _D = vmlaq_lane_f32(_D, _rows3, vget_high_f32(_b0123), 1); - vst1_u16(Dp, float2bfloat(_D)); + float32x4_t _Dp = vmulq_lane_f32(_rows0, vget_low_f32(_b0123), 0); + _Dp = vmlaq_lane_f32(_Dp, _rows1, vget_low_f32(_b0123), 1); + _Dp = vmlaq_lane_f32(_Dp, _rows2, vget_high_f32(_b0123), 0); + _Dp = vmlaq_lane_f32(_Dp, _rows3, vget_high_f32(_b0123), 1); + vst1_u16(Dp, float2bfloat(_Dp)); Dp += 4; rows0p += 4; diff --git a/src/layer/arm/interp_bicubic_pack4_fp16s.h b/src/layer/arm/interp_bicubic_pack4_fp16s.h index 76ad06a34ca..7f823148482 100644 --- a/src/layer/arm/interp_bicubic_pack4_fp16s.h +++ b/src/layer/arm/interp_bicubic_pack4_fp16s.h @@ -253,11 +253,11 @@ static void resize_bicubic_image_pack4_fp16s(const Mat& src, Mat& dst, float* al float32x4_t _rows1 = vld1q_f32(rows1p); float32x4_t _rows2 = vld1q_f32(rows2p); float32x4_t _rows3 = vld1q_f32(rows3p); - float32x4_t _D = vmulq_laneq_f32(_rows0, _b0123, 0); - _D = vfmaq_laneq_f32(_D, _rows1, _b0123, 1); - _D = vfmaq_laneq_f32(_D, _rows2, _b0123, 2); - _D = vfmaq_laneq_f32(_D, _rows3, _b0123, 3); - vst1_f16(Dp, vcvt_f16_f32(_D)); + float32x4_t _Dp = vmulq_laneq_f32(_rows0, _b0123, 0); + _Dp = vfmaq_laneq_f32(_Dp, _rows1, _b0123, 1); + _Dp = vfmaq_laneq_f32(_Dp, _rows2, _b0123, 2); + _Dp = vfmaq_laneq_f32(_Dp, _rows3, _b0123, 3); + vst1_f16(Dp, vcvt_f16_f32(_Dp)); Dp += 4; rows0p += 4; @@ -511,11 +511,11 @@ static void resize_bicubic_image_pack4_fp16sa(const Mat& src, Mat& dst, __fp16* float16x4_t _rows1 = vld1_f16(rows1p); float16x4_t _rows2 = vld1_f16(rows2p); float16x4_t _rows3 = vld1_f16(rows3p); - float16x4_t _D = vmul_lane_f16(_rows0, _b0123, 0); - _D = vfma_lane_f16(_D, _rows1, _b0123, 1); - _D = vfma_lane_f16(_D, _rows2, _b0123, 2); - _D = vfma_lane_f16(_D, _rows3, _b0123, 3); - vst1_f16(Dp, _D); + float16x4_t _Dp = vmul_lane_f16(_rows0, _b0123, 0); + _Dp = vfma_lane_f16(_Dp, _rows1, _b0123, 1); + _Dp = vfma_lane_f16(_Dp, _rows2, _b0123, 2); + _Dp = vfma_lane_f16(_Dp, _rows3, _b0123, 3); + vst1_f16(Dp, _Dp); Dp += 4; rows0p += 4; diff --git a/src/layer/arm/interp_bicubic_pack8_fp16s.h b/src/layer/arm/interp_bicubic_pack8_fp16s.h index 8d1eb4a24d2..75157b6d329 100644 --- a/src/layer/arm/interp_bicubic_pack8_fp16s.h +++ b/src/layer/arm/interp_bicubic_pack8_fp16s.h @@ -253,11 +253,11 @@ static void resize_bicubic_image_pack8_fp16sa(const Mat& src, Mat& dst, __fp16* float16x8_t _rows1 = vld1q_f16(rows1p); float16x8_t _rows2 = vld1q_f16(rows2p); float16x8_t _rows3 = vld1q_f16(rows3p); - float16x8_t _D = vmulq_lane_f16(_rows0, _b0123, 0); - _D = vfmaq_lane_f16(_D, _rows1, _b0123, 1); - _D = vfmaq_lane_f16(_D, _rows2, _b0123, 2); - _D = vfmaq_lane_f16(_D, _rows3, _b0123, 3); - vst1q_f16(Dp, _D); + float16x8_t _Dp = vmulq_lane_f16(_rows0, 
_b0123, 0); + _Dp = vfmaq_lane_f16(_Dp, _rows1, _b0123, 1); + _Dp = vfmaq_lane_f16(_Dp, _rows2, _b0123, 2); + _Dp = vfmaq_lane_f16(_Dp, _rows3, _b0123, 3); + vst1q_f16(Dp, _Dp); Dp += 8; rows0p += 8; diff --git a/src/layer/arm/interp_bilinear.h b/src/layer/arm/interp_bilinear.h index d35c20730be..38f00207447 100644 --- a/src/layer/arm/interp_bilinear.h +++ b/src/layer/arm/interp_bilinear.h @@ -193,18 +193,18 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x float32x4_t _rows0 = vld1q_f32(rows0p); float32x4_t _rows1 = vld1q_f32(rows1p); - float32x4_t _D = vmulq_f32(_rows0, _b0); - _D = vmlaq_f32(_D, _rows1, _b1); + float32x4_t _Dp = vmulq_f32(_rows0, _b0); + _Dp = vmlaq_f32(_Dp, _rows1, _b1); - vst1q_f32(Dp, _D); + vst1q_f32(Dp, _Dp); float32x4_t _rows0n = vld1q_f32(rows0p + 4); float32x4_t _rows1n = vld1q_f32(rows1p + 4); - float32x4_t _Dn = vmulq_f32(_rows0n, _b0); - _Dn = vmlaq_f32(_Dn, _rows1n, _b1); + float32x4_t _Dpn = vmulq_f32(_rows0n, _b0); + _Dpn = vmlaq_f32(_Dpn, _rows1n, _b1); - vst1q_f32(Dp + 4, _Dn); + vst1q_f32(Dp + 4, _Dpn); Dp += 8; rows0p += 8; diff --git a/src/layer/arm/interp_bilinear_bf16s.h b/src/layer/arm/interp_bilinear_bf16s.h index 1b061f29192..84583f83dcd 100644 --- a/src/layer/arm/interp_bilinear_bf16s.h +++ b/src/layer/arm/interp_bilinear_bf16s.h @@ -106,18 +106,18 @@ static void resize_bilinear_image_bf16s(const Mat& src, Mat& dst, float* alpha, float32x4_t _rows0 = vld1q_f32(rows0p); float32x4_t _rows1 = vld1q_f32(rows1p); - float32x4_t _D = vmulq_f32(_rows0, _b0); - _D = vmlaq_f32(_D, _rows1, _b1); + float32x4_t _Dp = vmulq_f32(_rows0, _b0); + _Dp = vmlaq_f32(_Dp, _rows1, _b1); - vst1_u16(Dp, float2bfloat(_D)); + vst1_u16(Dp, float2bfloat(_Dp)); float32x4_t _rows0n = vld1q_f32(rows0p + 4); float32x4_t _rows1n = vld1q_f32(rows1p + 4); - float32x4_t _Dn = vmulq_f32(_rows0n, _b0); - _Dn = vmlaq_f32(_Dn, _rows1n, _b1); + float32x4_t _Dpn = vmulq_f32(_rows0n, _b0); + _Dpn = vmlaq_f32(_Dpn, _rows1n, _b1); - vst1_u16(Dp + 4, float2bfloat(_Dn)); + vst1_u16(Dp + 4, float2bfloat(_Dpn)); Dp += 8; rows0p += 8; diff --git a/src/layer/arm/interp_bilinear_fp16s.h b/src/layer/arm/interp_bilinear_fp16s.h index 62b0fac32da..33314e7a1df 100644 --- a/src/layer/arm/interp_bilinear_fp16s.h +++ b/src/layer/arm/interp_bilinear_fp16s.h @@ -138,10 +138,10 @@ static void resize_bilinear_image_fp16s(const Mat& src, Mat& dst, float* alpha, float32x4_t _rows0 = vld1q_f32(rows0p); float32x4_t _rows1 = vld1q_f32(rows1p); - float32x4_t _D = vmulq_f32(_rows0, _b0); - _D = vfmaq_f32(_D, _rows1, _b1); + float32x4_t _Dp = vmulq_f32(_rows0, _b0); + _Dp = vfmaq_f32(_Dp, _rows1, _b1); - vst1_f16(Dp, vcvt_f16_f32(_D)); + vst1_f16(Dp, vcvt_f16_f32(_Dp)); float32x4_t _rows0n = vld1q_f32(rows0p + 4); float32x4_t _rows1n = vld1q_f32(rows1p + 4); @@ -254,10 +254,10 @@ static void resize_bilinear_image_fp16sa(const Mat& src, Mat& dst, __fp16* alpha float16x8_t _rows0 = vld1q_f16(rows0p); float16x8_t _rows1 = vld1q_f16(rows1p); - float16x8_t _D = vmulq_f16(_rows0, _b0); - _D = vfmaq_f16(_D, _rows1, _b1); + float16x8_t _Dp = vmulq_f16(_rows0, _b0); + _Dp = vfmaq_f16(_Dp, _rows1, _b1); - vst1q_f16(Dp, _D); + vst1q_f16(Dp, _Dp); Dp += 8; rows0p += 8; diff --git a/src/layer/arm/interp_bilinear_pack4.h b/src/layer/arm/interp_bilinear_pack4.h index f8c92079436..46f20485e88 100644 --- a/src/layer/arm/interp_bilinear_pack4.h +++ b/src/layer/arm/interp_bilinear_pack4.h @@ -106,9 +106,9 @@ static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, { float32x4_t 
_rows0 = vld1q_f32(rows0p); float32x4_t _rows1 = vld1q_f32(rows1p); - float32x4_t _D = vmulq_lane_f32(_rows0, _b01, 0); - _D = vmlaq_lane_f32(_D, _rows1, _b01, 1); - vst1q_f32(Dp, _D); + float32x4_t _Dp = vmulq_lane_f32(_rows0, _b01, 0); + _Dp = vmlaq_lane_f32(_Dp, _rows1, _b01, 1); + vst1q_f32(Dp, _Dp); Dp += 4; rows0p += 4; diff --git a/src/layer/arm/interp_bilinear_pack4_bf16s.h b/src/layer/arm/interp_bilinear_pack4_bf16s.h index 5d4e8222ddc..87ebac503f9 100644 --- a/src/layer/arm/interp_bilinear_pack4_bf16s.h +++ b/src/layer/arm/interp_bilinear_pack4_bf16s.h @@ -106,9 +106,9 @@ static void resize_bilinear_image_pack4_bf16s(const Mat& src, Mat& dst, float* a { float32x4_t _rows0 = vld1q_f32(rows0p); float32x4_t _rows1 = vld1q_f32(rows1p); - float32x4_t _D = vmulq_lane_f32(_rows0, _b01, 0); - _D = vmlaq_lane_f32(_D, _rows1, _b01, 1); - vst1_u16(Dp, float2bfloat(_D)); + float32x4_t _Dp = vmulq_lane_f32(_rows0, _b01, 0); + _Dp = vmlaq_lane_f32(_Dp, _rows1, _b01, 1); + vst1_u16(Dp, float2bfloat(_Dp)); Dp += 4; rows0p += 4; diff --git a/src/layer/arm/interp_bilinear_pack4_fp16s.h b/src/layer/arm/interp_bilinear_pack4_fp16s.h index 12f2fa8d037..0b6c6de0092 100644 --- a/src/layer/arm/interp_bilinear_pack4_fp16s.h +++ b/src/layer/arm/interp_bilinear_pack4_fp16s.h @@ -106,9 +106,9 @@ static void resize_bilinear_image_pack4_fp16s(const Mat& src, Mat& dst, float* a { float32x4_t _rows0 = vld1q_f32(rows0p); float32x4_t _rows1 = vld1q_f32(rows1p); - float32x4_t _D = vmulq_lane_f32(_rows0, _b01, 0); - _D = vmlaq_lane_f32(_D, _rows1, _b01, 1); - vst1_f16(Dp, vcvt_f16_f32(_D)); + float32x4_t _Dp = vmulq_lane_f32(_rows0, _b01, 0); + _Dp = vmlaq_lane_f32(_Dp, _rows1, _b01, 1); + vst1_f16(Dp, vcvt_f16_f32(_Dp)); Dp += 4; rows0p += 4; @@ -213,9 +213,9 @@ static void resize_bilinear_image_pack4_fp16sa(const Mat& src, Mat& dst, __fp16* { float16x4_t _rows0 = vld1_f16(rows0p); float16x4_t _rows1 = vld1_f16(rows1p); - float16x4_t _D = vmul_lane_f16(_rows0, _b01, 0); - _D = vfma_lane_f16(_D, _rows1, _b01, 1); - vst1_f16(Dp, _D); + float16x4_t _Dp = vmul_lane_f16(_rows0, _b01, 0); + _Dp = vfma_lane_f16(_Dp, _rows1, _b01, 1); + vst1_f16(Dp, _Dp); Dp += 4; rows0p += 4; diff --git a/src/layer/arm/interp_bilinear_pack8_fp16s.h b/src/layer/arm/interp_bilinear_pack8_fp16s.h index 22eaf72e47f..c0f84b01262 100644 --- a/src/layer/arm/interp_bilinear_pack8_fp16s.h +++ b/src/layer/arm/interp_bilinear_pack8_fp16s.h @@ -106,9 +106,9 @@ static void resize_bilinear_image_pack8_fp16sa(const Mat& src, Mat& dst, __fp16* { float16x8_t _rows0 = vld1q_f16(rows0p); float16x8_t _rows1 = vld1q_f16(rows1p); - float16x8_t _D = vmulq_lane_f16(_rows0, _b01, 0); - _D = vfmaq_lane_f16(_D, _rows1, _b01, 1); - vst1q_f16(Dp, _D); + float16x8_t _Dp = vmulq_lane_f16(_rows0, _b01, 0); + _Dp = vfmaq_lane_f16(_Dp, _rows1, _b01, 1); + vst1q_f16(Dp, _Dp); Dp += 8; rows0p += 8; diff --git a/src/layer/arm/lstm_arm.cpp b/src/layer/arm/lstm_arm.cpp index c34c2d1d0ed..79a0c97c917 100644 --- a/src/layer/arm/lstm_arm.cpp +++ b/src/layer/arm/lstm_arm.cpp @@ -323,24 +323,24 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w float32x4x4_t _IFOG_4x4 = vld4q_f32(gates_data); - float32x4_t _I = sigmoid_ps(_IFOG_4x4.val[0]); - float32x4_t _F = sigmoid_ps(_IFOG_4x4.val[1]); - float32x4_t _O = sigmoid_ps(_IFOG_4x4.val[2]); - float32x4_t _G = tanh_ps(_IFOG_4x4.val[3]); + float32x4_t _lstm_I = sigmoid_ps(_IFOG_4x4.val[0]); + float32x4_t _lstm_F = sigmoid_ps(_IFOG_4x4.val[1]); + float32x4_t _lstm_O = sigmoid_ps(_IFOG_4x4.val[2]); + 
float32x4_t _lstm_G = tanh_ps(_IFOG_4x4.val[3]); - float32x4_t _cell2 = vaddq_f32(vmulq_f32(_F, vld1q_f32(cell_ptr + q)), vmulq_f32(_I, _G)); - float32x4_t _H = vmulq_f32(_O, tanh_ps(_cell2)); + float32x4_t _cell2 = vaddq_f32(vmulq_f32(_lstm_F, vld1q_f32(cell_ptr + q)), vmulq_f32(_lstm_I, _lstm_G)); + float32x4_t _lstm_H = vmulq_f32(_lstm_O, tanh_ps(_cell2)); vst1q_f32(cell_ptr + q, _cell2); if (num_output == hidden_size) { - vst1q_f32(hidden_ptr + q, _H); - vst1q_f32(output_data + q, _H); + vst1q_f32(hidden_ptr + q, _lstm_H); + vst1q_f32(output_data + q, _lstm_H); } else { - vst1q_f32(tmp_hidden_ptr + q, _H); + vst1q_f32(tmp_hidden_ptr + q, _lstm_H); } } #endif // __ARM_NEON @@ -778,24 +778,24 @@ static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float32x4x4_t _IFOG_4x4 = vld4q_f32(gates_data); - float32x4_t _I = sigmoid_ps(_IFOG_4x4.val[0]); - float32x4_t _F = sigmoid_ps(_IFOG_4x4.val[1]); - float32x4_t _O = sigmoid_ps(_IFOG_4x4.val[2]); - float32x4_t _G = tanh_ps(_IFOG_4x4.val[3]); + float32x4_t _lstm_I = sigmoid_ps(_IFOG_4x4.val[0]); + float32x4_t _lstm_F = sigmoid_ps(_IFOG_4x4.val[1]); + float32x4_t _lstm_O = sigmoid_ps(_IFOG_4x4.val[2]); + float32x4_t _lstm_G = tanh_ps(_IFOG_4x4.val[3]); - float32x4_t _cell2 = vaddq_f32(vmulq_f32(_F, vld1q_f32(cell_ptr + q)), vmulq_f32(_I, _G)); - float32x4_t _H = vmulq_f32(_O, tanh_ps(_cell2)); + float32x4_t _cell2 = vaddq_f32(vmulq_f32(_lstm_F, vld1q_f32(cell_ptr + q)), vmulq_f32(_lstm_I, _lstm_G)); + float32x4_t _lstm_H = vmulq_f32(_lstm_O, tanh_ps(_cell2)); vst1q_f32(cell_ptr + q, _cell2); if (num_output == hidden_size) { - vst1q_f32(hidden_ptr + q, _H); - vst1_u16(output_data + q, float2bfloat(_H)); + vst1q_f32(hidden_ptr + q, _lstm_H); + vst1_u16(output_data + q, float2bfloat(_lstm_H)); } else { - vst1q_f32(tmp_hidden_ptr + q, _H); + vst1q_f32(tmp_hidden_ptr + q, _lstm_H); } } #endif // __ARM_NEON diff --git a/src/layer/arm/lstm_arm_asimdhp.cpp b/src/layer/arm/lstm_arm_asimdhp.cpp index eea2e8ce41d..a394bad4c2e 100644 --- a/src/layer/arm/lstm_arm_asimdhp.cpp +++ b/src/layer/arm/lstm_arm_asimdhp.cpp @@ -163,24 +163,24 @@ static int lstm_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float32x4x4_t _IFOG_4x4 = vld4q_f32(gates_data); - float32x4_t _I = sigmoid_ps(_IFOG_4x4.val[0]); - float32x4_t _F = sigmoid_ps(_IFOG_4x4.val[1]); - float32x4_t _O = sigmoid_ps(_IFOG_4x4.val[2]); - float32x4_t _G = tanh_ps(_IFOG_4x4.val[3]); + float32x4_t _lstm_I = sigmoid_ps(_IFOG_4x4.val[0]); + float32x4_t _lstm_F = sigmoid_ps(_IFOG_4x4.val[1]); + float32x4_t _lstm_O = sigmoid_ps(_IFOG_4x4.val[2]); + float32x4_t _lstm_G = tanh_ps(_IFOG_4x4.val[3]); - float32x4_t _cell2 = vaddq_f32(vmulq_f32(_F, vld1q_f32(cell_ptr + q)), vmulq_f32(_I, _G)); - float32x4_t _H = vmulq_f32(_O, tanh_ps(_cell2)); + float32x4_t _cell2 = vaddq_f32(vmulq_f32(_lstm_F, vld1q_f32(cell_ptr + q)), vmulq_f32(_lstm_I, _lstm_G)); + float32x4_t _lstm_H = vmulq_f32(_lstm_O, tanh_ps(_cell2)); vst1q_f32(cell_ptr + q, _cell2); if (num_output == hidden_size) { - vst1q_f32(hidden_ptr + q, _H); - vst1_f16(output_data + q, vcvt_f16_f32(_H)); + vst1q_f32(hidden_ptr + q, _lstm_H); + vst1_f16(output_data + q, vcvt_f16_f32(_lstm_H)); } else { - vst1q_f32(tmp_hidden_ptr + q, _H); + vst1q_f32(tmp_hidden_ptr + q, _lstm_H); } } #pragma omp parallel for num_threads(opt.num_threads) @@ -503,24 +503,24 @@ static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float16x4x4_t _IFOG_4x4 = vld4_f16(gates_data); - float32x4_t _I = 
sigmoid_ps(vcvt_f32_f16(_IFOG_4x4.val[0])); - float32x4_t _F = sigmoid_ps(vcvt_f32_f16(_IFOG_4x4.val[1])); - float32x4_t _O = sigmoid_ps(vcvt_f32_f16(_IFOG_4x4.val[2])); - float32x4_t _G = tanh_ps(vcvt_f32_f16(_IFOG_4x4.val[3])); + float32x4_t _lstm_I = sigmoid_ps(vcvt_f32_f16(_IFOG_4x4.val[0])); + float32x4_t _lstm_F = sigmoid_ps(vcvt_f32_f16(_IFOG_4x4.val[1])); + float32x4_t _lstm_O = sigmoid_ps(vcvt_f32_f16(_IFOG_4x4.val[2])); + float32x4_t _lstm_G = tanh_ps(vcvt_f32_f16(_IFOG_4x4.val[3])); - float32x4_t _cell2 = vaddq_f32(vmulq_f32(_F, vld1q_f32(cell_ptr + q)), vmulq_f32(_I, _G)); - float32x4_t _H = vmulq_f32(_O, tanh_ps(_cell2)); + float32x4_t _cell2 = vaddq_f32(vmulq_f32(_lstm_F, vld1q_f32(cell_ptr + q)), vmulq_f32(_lstm_I, _lstm_G)); + float32x4_t _lstm_H = vmulq_f32(_lstm_O, tanh_ps(_cell2)); vst1q_f32(cell_ptr + q, _cell2); if (num_output == hidden_size) { - vst1q_f32(hidden_ptr + q, _H); - vst1_f16(output_data + q, vcvt_f16_f32(_H)); + vst1q_f32(hidden_ptr + q, _lstm_H); + vst1_f16(output_data + q, vcvt_f16_f32(_lstm_H)); } else { - vst1q_f32(tmp_hidden_ptr + q, _H); + vst1q_f32(tmp_hidden_ptr + q, _lstm_H); } } #pragma omp parallel for num_threads(opt.num_threads) diff --git a/src/layer/arm/rnn_arm.cpp b/src/layer/arm/rnn_arm.cpp index 05ef13fff57..87892d7ada2 100644 --- a/src/layer/arm/rnn_arm.cpp +++ b/src/layer/arm/rnn_arm.cpp @@ -176,7 +176,7 @@ static int rnn(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* weight_xc_ptr = weight_xc.row(q / 4); const float* weight_hc_ptr = weight_hc.row(q / 4); - float32x4_t _H = vld1q_f32((const float*)bias_c + q); + float32x4_t _rnn_H = vld1q_f32((const float*)bias_c + q); float32x4_t _sum1 = vdupq_n_f32(0.f); float32x4_t _sum2 = vdupq_n_f32(0.f); float32x4_t _sum3 = vdupq_n_f32(0.f); @@ -190,12 +190,12 @@ static int rnn(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we float32x4_t _weight_xc_2 = vld1q_f32(weight_xc_ptr + 8); float32x4_t _weight_xc_3 = vld1q_f32(weight_xc_ptr + 12); #if __aarch64__ - _H = vfmaq_laneq_f32(_H, _weight_xc, _x, 0); + _rnn_H = vfmaq_laneq_f32(_rnn_H, _weight_xc, _x, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_1, _x, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_2, _x, 2); _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_3, _x, 3); #else - _H = vmlaq_lane_f32(_H, _weight_xc, vget_low_f32(_x), 0); + _rnn_H = vmlaq_lane_f32(_rnn_H, _weight_xc, vget_low_f32(_x), 0); _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_1, vget_low_f32(_x), 1); _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_2, vget_high_f32(_x), 0); _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_3, vget_high_f32(_x), 1); @@ -207,7 +207,7 @@ static int rnn(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we { float32x4_t _x = vdupq_n_f32(x[i]); float32x4_t _weight_xc = vld1q_f32(weight_xc_ptr); - _H = vmlaq_f32(_H, _weight_xc, _x); + _rnn_H = vmlaq_f32(_rnn_H, _weight_xc, _x); weight_xc_ptr += 4; } @@ -221,12 +221,12 @@ static int rnn(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we float32x4_t _weight_hc_2 = vld1q_f32(weight_hc_ptr + 8); float32x4_t _weight_hc_3 = vld1q_f32(weight_hc_ptr + 12); #if __aarch64__ - _H = vfmaq_laneq_f32(_H, _weight_hc, _hidden_state, 0); + _rnn_H = vfmaq_laneq_f32(_rnn_H, _weight_hc, _hidden_state, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_1, _hidden_state, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_2, _hidden_state, 2); _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_3, _hidden_state, 3); #else - _H = vmlaq_lane_f32(_H, _weight_hc, vget_low_f32(_hidden_state), 0); + 
_rnn_H = vmlaq_lane_f32(_rnn_H, _weight_hc, vget_low_f32(_hidden_state), 0); _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_1, vget_low_f32(_hidden_state), 1); _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_2, vget_high_f32(_hidden_state), 0); _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_3, vget_high_f32(_hidden_state), 1); @@ -238,18 +238,18 @@ static int rnn(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we { float32x4_t _hidden_state = vdupq_n_f32(hidden_state[i]); float32x4_t _weight_hc = vld1q_f32(weight_hc_ptr); - _H = vmlaq_f32(_H, _weight_hc, _hidden_state); + _rnn_H = vmlaq_f32(_rnn_H, _weight_hc, _hidden_state); weight_hc_ptr += 4; } - _H = vaddq_f32(_H, _sum1); + _rnn_H = vaddq_f32(_rnn_H, _sum1); _sum2 = vaddq_f32(_sum2, _sum3); - _H = vaddq_f32(_H, _sum2); + _rnn_H = vaddq_f32(_rnn_H, _sum2); - _H = tanh_ps(_H); + _rnn_H = tanh_ps(_rnn_H); - vst1q_f32((float*)gates + q, _H); + vst1q_f32((float*)gates + q, _rnn_H); } #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) @@ -293,10 +293,10 @@ static int rnn(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we { int q = qq * 4; - float32x4_t _H = vld1q_f32((float*)gates + q); + float32x4_t _rnn_H = vld1q_f32((float*)gates + q); - vst1q_f32(hidden_ptr + q, _H); - vst1q_f32(output_data + q, _H); + vst1q_f32(hidden_ptr + q, _rnn_H); + vst1q_f32(output_data + q, _rnn_H); } #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) @@ -511,7 +511,7 @@ static int rnn_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const unsigned short* weight_xc_ptr = weight_xc.row(q / 4); const unsigned short* weight_hc_ptr = weight_hc.row(q / 4); - float32x4_t _H = bfloat2float(vld1_u16((const unsigned short*)bias_c + q)); + float32x4_t _rnn_H = bfloat2float(vld1_u16((const unsigned short*)bias_c + q)); float32x4_t _sum1 = vdupq_n_f32(0.f); float32x4_t _sum2 = vdupq_n_f32(0.f); float32x4_t _sum3 = vdupq_n_f32(0.f); @@ -525,12 +525,12 @@ static int rnn_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _weight_xc_2 = bfloat2float(vld1_u16(weight_xc_ptr + 8)); float32x4_t _weight_xc_3 = bfloat2float(vld1_u16(weight_xc_ptr + 12)); #if __aarch64__ - _H = vfmaq_laneq_f32(_H, _weight_xc, _x, 0); + _rnn_H = vfmaq_laneq_f32(_rnn_H, _weight_xc, _x, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_1, _x, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_2, _x, 2); _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_3, _x, 3); #else - _H = vmlaq_lane_f32(_H, _weight_xc, vget_low_f32(_x), 0); + _rnn_H = vmlaq_lane_f32(_rnn_H, _weight_xc, vget_low_f32(_x), 0); _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_1, vget_low_f32(_x), 1); _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_2, vget_high_f32(_x), 0); _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_3, vget_high_f32(_x), 1); @@ -542,7 +542,7 @@ static int rnn_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M { float32x4_t _x = bfloat2float(vdup_n_u16(x[i])); float32x4_t _weight_xc = bfloat2float(vld1_u16(weight_xc_ptr)); - _H = vmlaq_f32(_H, _weight_xc, _x); + _rnn_H = vmlaq_f32(_rnn_H, _weight_xc, _x); weight_xc_ptr += 4; } @@ -556,12 +556,12 @@ static int rnn_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _weight_hc_2 = bfloat2float(vld1_u16(weight_hc_ptr + 8)); float32x4_t _weight_hc_3 = bfloat2float(vld1_u16(weight_hc_ptr + 12)); #if __aarch64__ - _H = vfmaq_laneq_f32(_H, _weight_hc, _hidden_state, 0); + _rnn_H = vfmaq_laneq_f32(_rnn_H, _weight_hc, _hidden_state, 0); _sum1 = vfmaq_laneq_f32(_sum1, 
_weight_hc_1, _hidden_state, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_2, _hidden_state, 2); _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_3, _hidden_state, 3); #else - _H = vmlaq_lane_f32(_H, _weight_hc, vget_low_f32(_hidden_state), 0); + _rnn_H = vmlaq_lane_f32(_rnn_H, _weight_hc, vget_low_f32(_hidden_state), 0); _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_1, vget_low_f32(_hidden_state), 1); _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_2, vget_high_f32(_hidden_state), 0); _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_3, vget_high_f32(_hidden_state), 1); @@ -573,18 +573,18 @@ static int rnn_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M { float32x4_t _hidden_state = vdupq_n_f32(hidden_state[i]); float32x4_t _weight_hc = bfloat2float(vld1_u16(weight_hc_ptr)); - _H = vmlaq_f32(_H, _weight_hc, _hidden_state); + _rnn_H = vmlaq_f32(_rnn_H, _weight_hc, _hidden_state); weight_hc_ptr += 4; } - _H = vaddq_f32(_H, _sum1); + _rnn_H = vaddq_f32(_rnn_H, _sum1); _sum2 = vaddq_f32(_sum2, _sum3); - _H = vaddq_f32(_H, _sum2); + _rnn_H = vaddq_f32(_rnn_H, _sum2); - _H = tanh_ps(_H); + _rnn_H = tanh_ps(_rnn_H); - vst1q_f32((float*)gates + q, _H); + vst1q_f32((float*)gates + q, _rnn_H); } #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) @@ -628,10 +628,10 @@ static int rnn_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M { int q = qq * 4; - float32x4_t _H = vld1q_f32((float*)gates + q); + float32x4_t _rnn_H = vld1q_f32((float*)gates + q); - vst1q_f32(hidden_ptr + q, _H); - vst1_u16(output_data + q, float2bfloat(_H)); + vst1q_f32(hidden_ptr + q, _rnn_H); + vst1_u16(output_data + q, float2bfloat(_rnn_H)); } #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) diff --git a/src/layer/arm/rnn_arm_asimdhp.cpp b/src/layer/arm/rnn_arm_asimdhp.cpp index d7aa135e584..79fb0b1db1e 100644 --- a/src/layer/arm/rnn_arm_asimdhp.cpp +++ b/src/layer/arm/rnn_arm_asimdhp.cpp @@ -54,7 +54,7 @@ static int rnn_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const __fp16* weight_xc_ptr = weight_xc.row(q / 4); const __fp16* weight_hc_ptr = weight_hc.row(q / 4); - float32x4_t _H = vcvt_f32_f16(vld1_f16((const __fp16*)bias_c + q)); + float32x4_t _rnn_H = vcvt_f32_f16(vld1_f16((const __fp16*)bias_c + q)); float32x4_t _sum1 = vdupq_n_f32(0.f); float32x4_t _sum2 = vdupq_n_f32(0.f); float32x4_t _sum3 = vdupq_n_f32(0.f); @@ -67,7 +67,7 @@ static int rnn_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _weight_xc_1 = vcvt_f32_f16(vld1_f16(weight_xc_ptr + 4)); float32x4_t _weight_xc_2 = vcvt_f32_f16(vld1_f16(weight_xc_ptr + 8)); float32x4_t _weight_xc_3 = vcvt_f32_f16(vld1_f16(weight_xc_ptr + 12)); - _H = vfmaq_laneq_f32(_H, _weight_xc, _x, 0); + _rnn_H = vfmaq_laneq_f32(_rnn_H, _weight_xc, _x, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_1, _x, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_2, _x, 2); _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_3, _x, 3); @@ -78,7 +78,7 @@ static int rnn_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M { float32x4_t _x = vcvt_f32_f16(vdup_n_f16(x[i])); float32x4_t _weight_xc = vcvt_f32_f16(vld1_f16(weight_xc_ptr)); - _H = vfmaq_f32(_H, _weight_xc, _x); + _rnn_H = vfmaq_f32(_rnn_H, _weight_xc, _x); weight_xc_ptr += 4; } @@ -91,7 +91,7 @@ static int rnn_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M float32x4_t _weight_hc_1 = vcvt_f32_f16(vld1_f16(weight_hc_ptr + 4)); float32x4_t _weight_hc_2 = vcvt_f32_f16(vld1_f16(weight_hc_ptr + 8)); float32x4_t 
_weight_hc_3 = vcvt_f32_f16(vld1_f16(weight_hc_ptr + 12)); - _H = vfmaq_laneq_f32(_H, _weight_hc, _hidden_state, 0); + _rnn_H = vfmaq_laneq_f32(_rnn_H, _weight_hc, _hidden_state, 0); _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_1, _hidden_state, 1); _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_2, _hidden_state, 2); _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_3, _hidden_state, 3); @@ -102,18 +102,18 @@ static int rnn_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M { float32x4_t _hidden_state = vdupq_n_f32(hidden_state[i]); float32x4_t _weight_hc = vcvt_f32_f16(vld1_f16(weight_hc_ptr)); - _H = vfmaq_f32(_H, _weight_hc, _hidden_state); + _rnn_H = vfmaq_f32(_rnn_H, _weight_hc, _hidden_state); weight_hc_ptr += 4; } - _H = vaddq_f32(_H, _sum1); + _rnn_H = vaddq_f32(_rnn_H, _sum1); _sum2 = vaddq_f32(_sum2, _sum3); - _H = vaddq_f32(_H, _sum2); + _rnn_H = vaddq_f32(_rnn_H, _sum2); - _H = tanh_ps(_H); + _rnn_H = tanh_ps(_rnn_H); - vst1q_f32((float*)gates + q, _H); + vst1q_f32((float*)gates + q, _rnn_H); } #pragma omp parallel for num_threads(opt.num_threads) for (int q = remain_num_output_start; q < num_output; q++) @@ -149,10 +149,10 @@ static int rnn_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M { int q = qq * 4; - float32x4_t _H = vld1q_f32((float*)gates + q); + float32x4_t _rnn_H = vld1q_f32((float*)gates + q); - vst1q_f32(hidden_ptr + q, _H); - vst1_f16(output_data + q, vcvt_f16_f32(_H)); + vst1q_f32(hidden_ptr + q, _rnn_H); + vst1_f16(output_data + q, vcvt_f16_f32(_rnn_H)); } #pragma omp parallel for num_threads(opt.num_threads) for (int q = remain_num_output_start; q < num_output; q++) @@ -196,7 +196,7 @@ static int rnn_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* weight_xc_ptr = weight_xc.row(q / 8); const __fp16* weight_hc_ptr = weight_hc.row(q / 8); - float16x8_t _H = vld1q_f16((const __fp16*)bias_c + q); + float16x8_t _rnn_H = vld1q_f16((const __fp16*)bias_c + q); float16x8_t _sum1 = vdupq_n_f16(0.f); float16x8_t _sum2 = vdupq_n_f16(0.f); float16x8_t _sum3 = vdupq_n_f16(0.f); @@ -209,7 +209,7 @@ static int rnn_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float16x8_t _weight_xc_1 = vld1q_f16(weight_xc_ptr + 8); float16x8_t _weight_xc_2 = vld1q_f16(weight_xc_ptr + 16); float16x8_t _weight_xc_3 = vld1q_f16(weight_xc_ptr + 24); - _H = vfmaq_lane_f16(_H, _weight_xc, _x, 0); + _rnn_H = vfmaq_lane_f16(_rnn_H, _weight_xc, _x, 0); _sum1 = vfmaq_lane_f16(_sum1, _weight_xc_1, _x, 1); _sum2 = vfmaq_lane_f16(_sum2, _weight_xc_2, _x, 2); _sum3 = vfmaq_lane_f16(_sum3, _weight_xc_3, _x, 3); @@ -220,7 +220,7 @@ static int rnn_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const { float16x8_t _x = vdupq_n_f16(x[i]); float16x8_t _weight_xc = vld1q_f16(weight_xc_ptr); - _H = vfmaq_f16(_H, _weight_xc, _x); + _rnn_H = vfmaq_f16(_rnn_H, _weight_xc, _x); weight_xc_ptr += 8; } @@ -233,7 +233,7 @@ static int rnn_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float16x8_t _weight_hc_1 = vld1q_f16(weight_hc_ptr + 8); float16x8_t _weight_hc_2 = vld1q_f16(weight_hc_ptr + 16); float16x8_t _weight_hc_3 = vld1q_f16(weight_hc_ptr + 24); - _H = vfmaq_lane_f16(_H, _weight_hc, _hidden_state, 0); + _rnn_H = vfmaq_lane_f16(_rnn_H, _weight_hc, _hidden_state, 0); _sum1 = vfmaq_lane_f16(_sum1, _weight_hc_1, _hidden_state, 1); _sum2 = vfmaq_lane_f16(_sum2, _weight_hc_2, _hidden_state, 2); _sum3 = vfmaq_lane_f16(_sum3, _weight_hc_3, _hidden_state, 3); @@ -244,17 +244,17 @@ static int rnn_fp16sa(const Mat& bottom_blob, 
Mat& top_blob, int reverse, const { float16x8_t _hidden_state = vdupq_n_f16((__fp16)hidden_state[i]); float16x8_t _weight_hc = vld1q_f16(weight_hc_ptr); - _H = vfmaq_f16(_H, _weight_hc, _hidden_state); + _rnn_H = vfmaq_f16(_rnn_H, _weight_hc, _hidden_state); weight_hc_ptr += 8; } - _H = vaddq_f16(_H, _sum1); + _rnn_H = vaddq_f16(_rnn_H, _sum1); _sum2 = vaddq_f16(_sum2, _sum3); - _H = vaddq_f16(_H, _sum2); + _rnn_H = vaddq_f16(_rnn_H, _sum2); - float32x4_t _H32low = tanh_ps(vcvt_f32_f16(vget_low_f16(_H))); - float32x4_t _H32high = tanh_ps(vcvt_f32_f16(vget_high_f16(_H))); + float32x4_t _H32low = tanh_ps(vcvt_f32_f16(vget_low_f16(_rnn_H))); + float32x4_t _H32high = tanh_ps(vcvt_f32_f16(vget_high_f16(_rnn_H))); vst1q_f32((float*)gates + q, _H32low); vst1q_f32((float*)gates + q + 4, _H32high); @@ -268,7 +268,7 @@ static int rnn_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* weight_xc_ptr = weight_xc.row(q / 8 + (q % 8) / 4); const __fp16* weight_hc_ptr = weight_hc.row(q / 8 + (q % 8) / 4); - float16x4_t _H = vld1_f16((const __fp16*)bias_c + q); + float16x4_t _rnn_H = vld1_f16((const __fp16*)bias_c + q); float16x4_t _sum1 = vdup_n_f16(0.f); float16x4_t _sum2 = vdup_n_f16(0.f); float16x4_t _sum3 = vdup_n_f16(0.f); @@ -281,7 +281,7 @@ static int rnn_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float16x4_t _weight_xc_1 = vld1_f16(weight_xc_ptr + 4); float16x4_t _weight_xc_2 = vld1_f16(weight_xc_ptr + 8); float16x4_t _weight_xc_3 = vld1_f16(weight_xc_ptr + 12); - _H = vfma_lane_f16(_H, _weight_xc, _x, 0); + _rnn_H = vfma_lane_f16(_rnn_H, _weight_xc, _x, 0); _sum1 = vfma_lane_f16(_sum1, _weight_xc_1, _x, 1); _sum2 = vfma_lane_f16(_sum2, _weight_xc_2, _x, 2); _sum3 = vfma_lane_f16(_sum3, _weight_xc_3, _x, 3); @@ -292,7 +292,7 @@ static int rnn_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const { float16x4_t _x = vdup_n_f16(x[i]); float16x4_t _weight_xc = vld1_f16(weight_xc_ptr); - _H = vfma_f16(_H, _weight_xc, _x); + _rnn_H = vfma_f16(_rnn_H, _weight_xc, _x); weight_xc_ptr += 4; } @@ -305,7 +305,7 @@ static int rnn_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float16x4_t _weight_hc_1 = vld1_f16(weight_hc_ptr + 4); float16x4_t _weight_hc_2 = vld1_f16(weight_hc_ptr + 8); float16x4_t _weight_hc_3 = vld1_f16(weight_hc_ptr + 12); - _H = vfma_lane_f16(_H, _weight_hc, _hidden_state, 0); + _rnn_H = vfma_lane_f16(_rnn_H, _weight_hc, _hidden_state, 0); _sum1 = vfma_lane_f16(_sum1, _weight_hc_1, _hidden_state, 1); _sum2 = vfma_lane_f16(_sum2, _weight_hc_2, _hidden_state, 2); _sum3 = vfma_lane_f16(_sum3, _weight_hc_3, _hidden_state, 3); @@ -316,16 +316,16 @@ static int rnn_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const { float16x4_t _hidden_state = vdup_n_f16((__fp16)hidden_state[i]); float16x4_t _weight_hc = vld1_f16(weight_hc_ptr); - _H = vfma_f16(_H, _weight_hc, _hidden_state); + _rnn_H = vfma_f16(_rnn_H, _weight_hc, _hidden_state); weight_hc_ptr += 4; } - _H = vadd_f16(_H, _sum1); + _rnn_H = vadd_f16(_rnn_H, _sum1); _sum2 = vadd_f16(_sum2, _sum3); - _H = vadd_f16(_H, _sum2); + _rnn_H = vadd_f16(_rnn_H, _sum2); - float32x4_t _H32 = tanh_ps(vcvt_f32_f16(_H)); + float32x4_t _H32 = tanh_ps(vcvt_f32_f16(_rnn_H)); vst1q_f32((float*)gates + q, _H32); } @@ -364,10 +364,10 @@ static int rnn_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const { int q = qq * 4; - float32x4_t _H = vld1q_f32((float*)gates + q); + float32x4_t _rnn_H = vld1q_f32((float*)gates + q); - vst1q_f32(hidden_ptr + q, _H); - 
vst1_f16(output_data + q, vcvt_f16_f32(_H)); + vst1q_f32(hidden_ptr + q, _rnn_H); + vst1_f16(output_data + q, vcvt_f16_f32(_rnn_H)); } #pragma omp parallel for num_threads(opt.num_threads) for (int q = remain_num_output_start; q < num_output; q++) diff --git a/src/layer/loongarch/interp_bicubic_pack4.h b/src/layer/loongarch/interp_bicubic_pack4.h index 54281691ad7..6e52dc1e469 100644 --- a/src/layer/loongarch/interp_bicubic_pack4.h +++ b/src/layer/loongarch/interp_bicubic_pack4.h @@ -268,11 +268,11 @@ static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, i __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); __m128 _rows2 = (__m128)__lsx_vld(rows2p, 0); __m128 _rows3 = (__m128)__lsx_vld(rows3p, 0); - __m128 _D = __lsx_vfmul_s(_rows0, _b0); - _D = __lsx_vfmadd_s(_b1, _rows1, _D); - _D = __lsx_vfmadd_s(_b2, _rows2, _D); - _D = __lsx_vfmadd_s(_b3, _rows3, _D); - __lsx_vst(_D, Dp, 0); + __m128 _Dp = __lsx_vfmul_s(_rows0, _b0); + _Dp = __lsx_vfmadd_s(_b1, _rows1, _Dp); + _Dp = __lsx_vfmadd_s(_b2, _rows2, _Dp); + _Dp = __lsx_vfmadd_s(_b3, _rows3, _Dp); + __lsx_vst(_Dp, Dp, 0); Dp += 4; rows0p += 4; diff --git a/src/layer/loongarch/interp_bilinear.h b/src/layer/loongarch/interp_bilinear.h index ad5a28672be..8ace1868644 100644 --- a/src/layer/loongarch/interp_bilinear.h +++ b/src/layer/loongarch/interp_bilinear.h @@ -143,18 +143,18 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); - __m128 _D = __lsx_vfmul_s(_rows0, _b0); - _D = __lsx_vfmadd_s(_b1, _rows1, _D); + __m128 _Dp = __lsx_vfmul_s(_rows0, _b0); + _Dp = __lsx_vfmadd_s(_b1, _rows1, _Dp); - __lsx_vst(_D, Dp, 0); + __lsx_vst(_Dp, Dp, 0); __m128 _rows0n = (__m128)__lsx_vld(rows0p + 4, 0); __m128 _rows1n = (__m128)__lsx_vld(rows1p + 4, 0); - __m128 _Dn = __lsx_vfmul_s(_rows0n, _b0); - _Dn = __lsx_vfmadd_s(_b1, _rows1n, _Dn); + __m128 _Dpn = __lsx_vfmul_s(_rows0n, _b0); + _Dpn = __lsx_vfmadd_s(_b1, _rows1n, _Dpn); - __lsx_vst(_Dn, Dp + 4, 0); + __lsx_vst(_Dpn, Dp + 4, 0); Dp += 8; rows0p += 8; diff --git a/src/layer/loongarch/interp_bilinear_pack4.h b/src/layer/loongarch/interp_bilinear_pack4.h index 2cfb138a1cb..3702b2571fa 100644 --- a/src/layer/loongarch/interp_bilinear_pack4.h +++ b/src/layer/loongarch/interp_bilinear_pack4.h @@ -109,9 +109,9 @@ static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, { __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); - __m128 _D = __lsx_vfmul_s(_rows0, _b0); - _D = __lsx_vfmadd_s(_b1, _rows1, _D); - __lsx_vst(_D, Dp, 0); + __m128 _Dp = __lsx_vfmul_s(_rows0, _b0); + _Dp = __lsx_vfmadd_s(_b1, _rows1, _Dp); + __lsx_vst(_Dp, Dp, 0); Dp += 4; rows0p += 4; diff --git a/src/layer/mips/interp_bicubic_pack4.h b/src/layer/mips/interp_bicubic_pack4.h index 4c80a863fa7..22a07e1646b 100644 --- a/src/layer/mips/interp_bicubic_pack4.h +++ b/src/layer/mips/interp_bicubic_pack4.h @@ -268,11 +268,11 @@ static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, i v4f32 _rows1 = (v4f32)__msa_ld_w(rows1p, 0); v4f32 _rows2 = (v4f32)__msa_ld_w(rows2p, 0); v4f32 _rows3 = (v4f32)__msa_ld_w(rows3p, 0); - v4f32 _D = __msa_fmul_w(_rows0, _b0); - _D = __msa_fmadd_w(_D, _rows1, _b1); - _D = __msa_fmadd_w(_D, _rows2, _b2); - _D = __msa_fmadd_w(_D, _rows3, _b3); - __msa_st_w((v4i32)_D, Dp, 0); + v4f32 _Dp = __msa_fmul_w(_rows0, _b0); + _Dp = __msa_fmadd_w(_Dp, _rows1, _b1); + _Dp = __msa_fmadd_w(_Dp, _rows2, 
_b2); + _Dp = __msa_fmadd_w(_Dp, _rows3, _b3); + __msa_st_w((v4i32)_Dp, Dp, 0); Dp += 4; rows0p += 4; diff --git a/src/layer/mips/interp_bilinear.h b/src/layer/mips/interp_bilinear.h index 787eb927af8..eba05714961 100644 --- a/src/layer/mips/interp_bilinear.h +++ b/src/layer/mips/interp_bilinear.h @@ -143,18 +143,18 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x v4f32 _rows0 = (v4f32)__msa_ld_w(rows0p, 0); v4f32 _rows1 = (v4f32)__msa_ld_w(rows1p, 0); - v4f32 _D = __msa_fmul_w(_rows0, _b0); - _D = __msa_fmadd_w(_D, _rows1, _b1); + v4f32 _Dp = __msa_fmul_w(_rows0, _b0); + _Dp = __msa_fmadd_w(_Dp, _rows1, _b1); - __msa_st_w((v4i32)_D, Dp, 0); + __msa_st_w((v4i32)_Dp, Dp, 0); v4f32 _rows0n = (v4f32)__msa_ld_w(rows0p + 4, 0); v4f32 _rows1n = (v4f32)__msa_ld_w(rows1p + 4, 0); - v4f32 _Dn = __msa_fmul_w(_rows0n, _b0); - _Dn = __msa_fmadd_w(_Dn, _rows1n, _b1); + v4f32 _Dpn = __msa_fmul_w(_rows0n, _b0); + _Dpn = __msa_fmadd_w(_Dpn, _rows1n, _b1); - __msa_st_w((v4i32)_Dn, Dp + 4, 0); + __msa_st_w((v4i32)_Dpn, Dp + 4, 0); Dp += 8; rows0p += 8; diff --git a/src/layer/mips/interp_bilinear_pack4.h b/src/layer/mips/interp_bilinear_pack4.h index 992216403f8..500e8648a11 100644 --- a/src/layer/mips/interp_bilinear_pack4.h +++ b/src/layer/mips/interp_bilinear_pack4.h @@ -109,9 +109,9 @@ static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, { v4f32 _rows0 = (v4f32)__msa_ld_w(rows0p, 0); v4f32 _rows1 = (v4f32)__msa_ld_w(rows1p, 0); - v4f32 _D = __msa_fmul_w(_rows0, _b0); - _D = __msa_fmadd_w(_D, _rows1, _b1); - __msa_st_w((v4i32)_D, Dp, 0); + v4f32 _Dp = __msa_fmul_w(_rows0, _b0); + _Dp = __msa_fmadd_w(_Dp, _rows1, _b1); + __msa_st_w((v4i32)_Dp, Dp, 0); Dp += 4; rows0p += 4; diff --git a/src/layer/riscv/interp_bicubic_packn.h b/src/layer/riscv/interp_bicubic_packn.h index 4c4eb869c43..b19af95f0f8 100644 --- a/src/layer/riscv/interp_bicubic_packn.h +++ b/src/layer/riscv/interp_bicubic_packn.h @@ -226,9 +226,9 @@ static void resize_bicubic_image_packn(const Mat& src, Mat& dst, float* alpha, i vfloat32m1_t _rows2 = vle32_v_f32m1(rows2p, vl); vfloat32m1_t _rows3 = vle32_v_f32m1(rows3p, vl); - vfloat32m1_t _D = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_rows0, b0, vl), b1, _rows1, vl), b2, _rows2, vl), b3, _rows3, vl); + vfloat32m1_t _Dp = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_rows0, b0, vl), b1, _rows1, vl), b2, _rows2, vl), b3, _rows3, vl); - vse32_v_f32m1(Dp, _D, vl); + vse32_v_f32m1(Dp, _Dp, vl); Dp += packn; rows0p += packn; diff --git a/src/layer/riscv/interp_bicubic_packn_fp16s.h b/src/layer/riscv/interp_bicubic_packn_fp16s.h index ff2284552b7..f87d5bb5a4c 100644 --- a/src/layer/riscv/interp_bicubic_packn_fp16s.h +++ b/src/layer/riscv/interp_bicubic_packn_fp16s.h @@ -226,9 +226,9 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al vfloat32m2_t _rows2 = vle32_v_f32m2(rows2p, vl); vfloat32m2_t _rows3 = vle32_v_f32m2(rows3p, vl); - vfloat32m2_t _D = vfmacc_vf_f32m2(vfmacc_vf_f32m2(vfmacc_vf_f32m2(vfmul_vf_f32m2(_rows0, b0, vl), b1, _rows1, vl), b2, _rows2, vl), b3, _rows3, vl); + vfloat32m2_t _Dp = vfmacc_vf_f32m2(vfmacc_vf_f32m2(vfmacc_vf_f32m2(vfmul_vf_f32m2(_rows0, b0, vl), b1, _rows1, vl), b2, _rows2, vl), b3, _rows3, vl); - vse16_v_f16m1(Dp, vfncvt_f_f_w_f16m1(_D, vl), vl); + vse16_v_f16m1(Dp, vfncvt_f_f_w_f16m1(_Dp, vl), vl); Dp += packn; rows0p += packn; @@ -455,9 +455,9 @@ static void resize_bicubic_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* 
vfloat16m1_t _rows2 = vle16_v_f16m1(rows2p, vl); vfloat16m1_t _rows3 = vle16_v_f16m1(rows3p, vl); - vfloat16m1_t _D = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_rows0, b0, vl), b1, _rows1, vl), b2, _rows2, vl), b3, _rows3, vl); + vfloat16m1_t _Dp = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_rows0, b0, vl), b1, _rows1, vl), b2, _rows2, vl), b3, _rows3, vl); - vse16_v_f16m1(Dp, _D, vl); + vse16_v_f16m1(Dp, _Dp, vl); Dp += packn; rows0p += packn; diff --git a/src/layer/riscv/interp_bilinear.h b/src/layer/riscv/interp_bilinear.h index 0f6338d7310..ffd613a6573 100644 --- a/src/layer/riscv/interp_bilinear.h +++ b/src/layer/riscv/interp_bilinear.h @@ -200,9 +200,9 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x vfloat32m8_t _rows0 = vle32_v_f32m8(rows0p, vl); vfloat32m8_t _rows1 = vle32_v_f32m8(rows1p, vl); - vfloat32m8_t _D = vfmacc_vf_f32m8(vfmul_vf_f32m8(_rows0, b0, vl), b1, _rows1, vl); + vfloat32m8_t _Dp = vfmacc_vf_f32m8(vfmul_vf_f32m8(_rows0, b0, vl), b1, _rows1, vl); - vse32_v_f32m8(Dp, _D, vl); + vse32_v_f32m8(Dp, _Dp, vl); Dp += vl; rows0p += vl; diff --git a/src/layer/riscv/interp_bilinear_fp16s.h b/src/layer/riscv/interp_bilinear_fp16s.h index cd61af6efac..318f36f8ab8 100644 --- a/src/layer/riscv/interp_bilinear_fp16s.h +++ b/src/layer/riscv/interp_bilinear_fp16s.h @@ -136,9 +136,9 @@ static void resize_bilinear_image_fp16s(const Mat& src, Mat& dst, float* alpha, vfloat32m8_t _rows0 = vle32_v_f32m8(rows0p, vl); vfloat32m8_t _rows1 = vle32_v_f32m8(rows1p, vl); - vfloat32m8_t _D = vfmacc_vf_f32m8(vfmul_vf_f32m8(_rows0, b0, vl), b1, _rows1, vl); + vfloat32m8_t _Dp = vfmacc_vf_f32m8(vfmul_vf_f32m8(_rows0, b0, vl), b1, _rows1, vl); - vse16_v_f16m4(Dp, vfncvt_f_f_w_f16m4(_D, vl), vl); + vse16_v_f16m4(Dp, vfncvt_f_f_w_f16m4(_Dp, vl), vl); Dp += vl; rows0p += vl; @@ -237,9 +237,9 @@ static void resize_bilinear_image_fp16sa(const Mat& src, Mat& dst, __fp16* alpha vfloat16m8_t _rows0 = vle16_v_f16m8(rows0p, vl); vfloat16m8_t _rows1 = vle16_v_f16m8(rows1p, vl); - vfloat16m8_t _D = vfmacc_vf_f16m8(vfmul_vf_f16m8(_rows0, b0, vl), b1, _rows1, vl); + vfloat16m8_t _Dp = vfmacc_vf_f16m8(vfmul_vf_f16m8(_rows0, b0, vl), b1, _rows1, vl); - vse16_v_f16m8(Dp, _D, vl); + vse16_v_f16m8(Dp, _Dp, vl); Dp += vl; rows0p += vl; diff --git a/src/layer/riscv/interp_bilinear_packn.h b/src/layer/riscv/interp_bilinear_packn.h index 9dffc01bf30..725651dd56f 100644 --- a/src/layer/riscv/interp_bilinear_packn.h +++ b/src/layer/riscv/interp_bilinear_packn.h @@ -106,9 +106,9 @@ static void resize_bilinear_image_packn(const Mat& src, Mat& dst, float* alpha, vfloat32m1_t _rows0 = vle32_v_f32m1(rows0p, vl); vfloat32m1_t _rows1 = vle32_v_f32m1(rows1p, vl); - vfloat32m1_t _D = vfmacc_vf_f32m1(vfmul_vf_f32m1(_rows0, b0, vl), b1, _rows1, vl); + vfloat32m1_t _Dp = vfmacc_vf_f32m1(vfmul_vf_f32m1(_rows0, b0, vl), b1, _rows1, vl); - vse32_v_f32m1(Dp, _D, vl); + vse32_v_f32m1(Dp, _Dp, vl); Dp += packn; rows0p += packn; diff --git a/src/layer/riscv/interp_bilinear_packn_fp16s.h b/src/layer/riscv/interp_bilinear_packn_fp16s.h index dfe02c00d1b..bfa239431f1 100644 --- a/src/layer/riscv/interp_bilinear_packn_fp16s.h +++ b/src/layer/riscv/interp_bilinear_packn_fp16s.h @@ -106,9 +106,9 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* a vfloat32m2_t _rows0 = vle32_v_f32m2(rows0p, vl); vfloat32m2_t _rows1 = vle32_v_f32m2(rows1p, vl); - vfloat32m2_t _D = vfmacc_vf_f32m2(vfmul_vf_f32m2(_rows0, b0, vl), b1, _rows1, vl); + vfloat32m2_t 
_Dp = vfmacc_vf_f32m2(vfmul_vf_f32m2(_rows0, b0, vl), b1, _rows1, vl); - vse16_v_f16m1(Dp, vfncvt_f_f_w_f16m1(_D, vl), vl); + vse16_v_f16m1(Dp, vfncvt_f_f_w_f16m1(_Dp, vl), vl); Dp += packn; rows0p += packn; @@ -213,9 +213,9 @@ static void resize_bilinear_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* vfloat16m1_t _rows0 = vle16_v_f16m1(rows0p, vl); vfloat16m1_t _rows1 = vle16_v_f16m1(rows1p, vl); - vfloat16m1_t _D = vfmacc_vf_f16m1(vfmul_vf_f16m1(_rows0, b0, vl), b1, _rows1, vl); + vfloat16m1_t _Dp = vfmacc_vf_f16m1(vfmul_vf_f16m1(_rows0, b0, vl), b1, _rows1, vl); - vse16_v_f16m1(Dp, _D, vl); + vse16_v_f16m1(Dp, _Dp, vl); Dp += packn; rows0p += packn; diff --git a/src/layer/x86/interp_bicubic.h b/src/layer/x86/interp_bicubic.h index 344996cb6b0..e327df71c28 100644 --- a/src/layer/x86/interp_bicubic.h +++ b/src/layer/x86/interp_bicubic.h @@ -264,11 +264,11 @@ static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xo __m256 _rows1 = _mm256_loadu_ps(rows1p); __m256 _rows2 = _mm256_loadu_ps(rows2p); __m256 _rows3 = _mm256_loadu_ps(rows3p); - __m256 _D = _mm256_mul_ps(_rows0, _b0_256); - _D = _mm256_comp_fmadd_ps(_rows1, _b1_256, _D); - _D = _mm256_comp_fmadd_ps(_rows2, _b2_256, _D); - _D = _mm256_comp_fmadd_ps(_rows3, _b3_256, _D); - _mm256_storeu_ps(Dp, _D); + __m256 _Dp = _mm256_mul_ps(_rows0, _b0_256); + _Dp = _mm256_comp_fmadd_ps(_rows1, _b1_256, _Dp); + _Dp = _mm256_comp_fmadd_ps(_rows2, _b2_256, _Dp); + _Dp = _mm256_comp_fmadd_ps(_rows3, _b3_256, _Dp); + _mm256_storeu_ps(Dp, _Dp); Dp += 8; rows0p += 8; @@ -287,11 +287,11 @@ static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xo __m128 _rows1 = _mm_loadu_ps(rows1p); __m128 _rows2 = _mm_loadu_ps(rows2p); __m128 _rows3 = _mm_loadu_ps(rows3p); - __m128 _D = _mm_mul_ps(_rows0, _b0_128); - _D = _mm_comp_fmadd_ps(_rows1, _b1_128, _D); - _D = _mm_comp_fmadd_ps(_rows2, _b2_128, _D); - _D = _mm_comp_fmadd_ps(_rows3, _b3_128, _D); - _mm_storeu_ps(Dp, _D); + __m128 _Dp = _mm_mul_ps(_rows0, _b0_128); + _Dp = _mm_comp_fmadd_ps(_rows1, _b1_128, _Dp); + _Dp = _mm_comp_fmadd_ps(_rows2, _b2_128, _Dp); + _Dp = _mm_comp_fmadd_ps(_rows3, _b3_128, _Dp); + _mm_storeu_ps(Dp, _Dp); Dp += 4; rows0p += 4; diff --git a/src/layer/x86/interp_bicubic_pack16.h b/src/layer/x86/interp_bicubic_pack16.h index a288959d50a..4c6eb14e5ee 100644 --- a/src/layer/x86/interp_bicubic_pack16.h +++ b/src/layer/x86/interp_bicubic_pack16.h @@ -268,11 +268,11 @@ static void resize_bicubic_image_pack16(const Mat& src, Mat& dst, float* alpha, __m512 _rows1 = _mm512_load_ps(rows1p); __m512 _rows2 = _mm512_load_ps(rows2p); __m512 _rows3 = _mm512_load_ps(rows3p); - __m512 _D = _mm512_mul_ps(_rows0, _b0); - _D = _mm512_fmadd_ps(_rows1, _b1, _D); - _D = _mm512_fmadd_ps(_rows2, _b2, _D); - _D = _mm512_fmadd_ps(_rows3, _b3, _D); - _mm512_store_ps(Dp, _D); + __m512 _Dp = _mm512_mul_ps(_rows0, _b0); + _Dp = _mm512_fmadd_ps(_rows1, _b1, _Dp); + _Dp = _mm512_fmadd_ps(_rows2, _b2, _Dp); + _Dp = _mm512_fmadd_ps(_rows3, _b3, _Dp); + _mm512_store_ps(Dp, _Dp); Dp += 16; rows0p += 16; diff --git a/src/layer/x86/interp_bicubic_pack4.h b/src/layer/x86/interp_bicubic_pack4.h index 5718bdc8784..de165a79f69 100644 --- a/src/layer/x86/interp_bicubic_pack4.h +++ b/src/layer/x86/interp_bicubic_pack4.h @@ -268,11 +268,11 @@ static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, i __m128 _rows1 = _mm_load_ps(rows1p); __m128 _rows2 = _mm_load_ps(rows2p); __m128 _rows3 = _mm_load_ps(rows3p); - __m128 _D = _mm_mul_ps(_rows0, _b0); - _D = 
_mm_comp_fmadd_ps(_rows1, _b1, _D); - _D = _mm_comp_fmadd_ps(_rows2, _b2, _D); - _D = _mm_comp_fmadd_ps(_rows3, _b3, _D); - _mm_store_ps(Dp, _D); + __m128 _Dp = _mm_mul_ps(_rows0, _b0); + _Dp = _mm_comp_fmadd_ps(_rows1, _b1, _Dp); + _Dp = _mm_comp_fmadd_ps(_rows2, _b2, _Dp); + _Dp = _mm_comp_fmadd_ps(_rows3, _b3, _Dp); + _mm_store_ps(Dp, _Dp); Dp += 4; rows0p += 4; diff --git a/src/layer/x86/interp_bicubic_pack8.h b/src/layer/x86/interp_bicubic_pack8.h index c70bc7b1501..e8cea178e39 100644 --- a/src/layer/x86/interp_bicubic_pack8.h +++ b/src/layer/x86/interp_bicubic_pack8.h @@ -268,11 +268,11 @@ static void resize_bicubic_image_pack8(const Mat& src, Mat& dst, float* alpha, i __m256 _rows1 = _mm256_load_ps(rows1p); __m256 _rows2 = _mm256_load_ps(rows2p); __m256 _rows3 = _mm256_load_ps(rows3p); - __m256 _D = _mm256_mul_ps(_rows0, _b0); - _D = _mm256_comp_fmadd_ps(_rows1, _b1, _D); - _D = _mm256_comp_fmadd_ps(_rows2, _b2, _D); - _D = _mm256_comp_fmadd_ps(_rows3, _b3, _D); - _mm256_store_ps(Dp, _D); + __m256 _Dp = _mm256_mul_ps(_rows0, _b0); + _Dp = _mm256_comp_fmadd_ps(_rows1, _b1, _Dp); + _Dp = _mm256_comp_fmadd_ps(_rows2, _b2, _Dp); + _Dp = _mm256_comp_fmadd_ps(_rows3, _b3, _Dp); + _mm256_store_ps(Dp, _Dp); Dp += 8; rows0p += 8; diff --git a/src/layer/x86/interp_bilinear.h b/src/layer/x86/interp_bilinear.h index c8430b43d0c..39d2432a0c1 100644 --- a/src/layer/x86/interp_bilinear.h +++ b/src/layer/x86/interp_bilinear.h @@ -137,9 +137,9 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x { __m256 _rows0 = _mm256_loadu_ps(rows0p); __m256 _rows1 = _mm256_loadu_ps(rows1p); - __m256 _D = _mm256_mul_ps(_rows0, _b0_256); - _D = _mm256_comp_fmadd_ps(_rows1, _b1_256, _D); - _mm256_storeu_ps(Dp, _D); + __m256 _Dp = _mm256_mul_ps(_rows0, _b0_256); + _Dp = _mm256_comp_fmadd_ps(_rows1, _b1_256, _Dp); + _mm256_storeu_ps(Dp, _Dp); Dp += 8; rows0p += 8; @@ -152,9 +152,9 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x { __m128 _rows0 = _mm_loadu_ps(rows0p); __m128 _rows1 = _mm_loadu_ps(rows1p); - __m128 _D = _mm_mul_ps(_rows0, _b0_128); - _D = _mm_comp_fmadd_ps(_rows1, _b1_128, _D); - _mm_storeu_ps(Dp, _D); + __m128 _Dp = _mm_mul_ps(_rows0, _b0_128); + _Dp = _mm_comp_fmadd_ps(_rows1, _b1_128, _Dp); + _mm_storeu_ps(Dp, _Dp); Dp += 4; rows0p += 4; diff --git a/src/layer/x86/interp_bilinear_pack16.h b/src/layer/x86/interp_bilinear_pack16.h index d33ae0557d5..5157bbe5761 100644 --- a/src/layer/x86/interp_bilinear_pack16.h +++ b/src/layer/x86/interp_bilinear_pack16.h @@ -109,9 +109,9 @@ static void resize_bilinear_image_pack16(const Mat& src, Mat& dst, float* alpha, { __m512 _rows0 = _mm512_load_ps(rows0p); __m512 _rows1 = _mm512_load_ps(rows1p); - __m512 _D = _mm512_mul_ps(_rows0, _b0); - _D = _mm512_fmadd_ps(_rows1, _b1, _D); - _mm512_store_ps(Dp, _D); + __m512 _Dp = _mm512_mul_ps(_rows0, _b0); + _Dp = _mm512_fmadd_ps(_rows1, _b1, _Dp); + _mm512_store_ps(Dp, _Dp); Dp += 16; rows0p += 16; diff --git a/src/layer/x86/interp_bilinear_pack4.h b/src/layer/x86/interp_bilinear_pack4.h index 4f50caf7430..3f1c7a1a5c8 100644 --- a/src/layer/x86/interp_bilinear_pack4.h +++ b/src/layer/x86/interp_bilinear_pack4.h @@ -109,9 +109,9 @@ static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, { __m128 _rows0 = _mm_load_ps(rows0p); __m128 _rows1 = _mm_load_ps(rows1p); - __m128 _D = _mm_mul_ps(_rows0, _b0); - _D = _mm_comp_fmadd_ps(_rows1, _b1, _D); - _mm_store_ps(Dp, _D); + __m128 _Dp = _mm_mul_ps(_rows0, _b0); + _Dp = 
_mm_comp_fmadd_ps(_rows1, _b1, _Dp); + _mm_store_ps(Dp, _Dp); Dp += 4; rows0p += 4; diff --git a/src/layer/x86/interp_bilinear_pack8.h b/src/layer/x86/interp_bilinear_pack8.h index 5199d479055..41e7071db89 100644 --- a/src/layer/x86/interp_bilinear_pack8.h +++ b/src/layer/x86/interp_bilinear_pack8.h @@ -109,9 +109,9 @@ static void resize_bilinear_image_pack8(const Mat& src, Mat& dst, float* alpha, { __m256 _rows0 = _mm256_load_ps(rows0p); __m256 _rows1 = _mm256_load_ps(rows1p); - __m256 _D = _mm256_mul_ps(_rows0, _b0); - _D = _mm256_comp_fmadd_ps(_rows1, _b1, _D); - _mm256_store_ps(Dp, _D); + __m256 _Dp = _mm256_mul_ps(_rows0, _b0); + _Dp = _mm256_comp_fmadd_ps(_rows1, _b1, _Dp); + _mm256_store_ps(Dp, _Dp); Dp += 8; rows0p += 8; diff --git a/src/layer/x86/lstm_x86.cpp b/src/layer/x86/lstm_x86.cpp index e0d4c73f2fc..21f528361e2 100644 --- a/src/layer/x86/lstm_x86.cpp +++ b/src/layer/x86/lstm_x86.cpp @@ -474,24 +474,24 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w _MM_TRANSPOSE4_PS(_IFOG_4x4_0, _IFOG_4x4_1, _IFOG_4x4_2, _IFOG_4x4_3); - __m128 _I = sigmoid_sse(_IFOG_4x4_0); - __m128 _F = sigmoid_sse(_IFOG_4x4_1); - __m128 _O = sigmoid_sse(_IFOG_4x4_2); - __m128 _G = tanh_sse(_IFOG_4x4_3); + __m128 _lstm_I = sigmoid_sse(_IFOG_4x4_0); + __m128 _lstm_F = sigmoid_sse(_IFOG_4x4_1); + __m128 _lstm_O = sigmoid_sse(_IFOG_4x4_2); + __m128 _lstm_G = tanh_sse(_IFOG_4x4_3); - __m128 _cell2 = _mm_add_ps(_mm_mul_ps(_F, _mm_loadu_ps(cell_ptr + q)), _mm_mul_ps(_I, _G)); - __m128 _H = _mm_mul_ps(_O, tanh_sse(_cell2)); + __m128 _cell2 = _mm_add_ps(_mm_mul_ps(_lstm_F, _mm_loadu_ps(cell_ptr + q)), _mm_mul_ps(_lstm_I, _lstm_G)); + __m128 _lstm_H = _mm_mul_ps(_lstm_O, tanh_sse(_cell2)); _mm_storeu_ps(cell_ptr + q, _cell2); if (num_output == hidden_size) { - _mm_storeu_ps(hidden_ptr + q, _H); - _mm_storeu_ps(output_data + q, _H); + _mm_storeu_ps(hidden_ptr + q, _lstm_H); + _mm_storeu_ps(output_data + q, _lstm_H); } else { - _mm_storeu_ps(tmp_hidden_ptr + q, _H); + _mm_storeu_ps(tmp_hidden_ptr + q, _lstm_H); } } #else // __SSE2__ diff --git a/src/mat_pixel_resize.cpp b/src/mat_pixel_resize.cpp index 555645d978b..7d171338469 100644 --- a/src/mat_pixel_resize.cpp +++ b/src/mat_pixel_resize.cpp @@ -229,9 +229,9 @@ void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstr int16x4_t _acc16 = vshrn_n_s32(_acc, 2); int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2); - uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); + uint8x8_t _Dp = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); - vst1_u8(Dp, _D); + vst1_u8(Dp, _Dp); Dp += 8; rows0p += 8; @@ -538,9 +538,9 @@ void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstr int16x4_t _acc16 = vshrn_n_s32(_acc, 2); int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2); - uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); + uint8x8_t _Dp = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); - vst1_u8(Dp, _D); + vst1_u8(Dp, _Dp); Dp += 8; rows0p += 8; @@ -858,9 +858,9 @@ void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstr int16x4_t _acc16 = vshrn_n_s32(_acc, 2); int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2); - uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); + uint8x8_t _Dp = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); - vst1_u8(Dp, _D); + vst1_u8(Dp, _Dp); Dp += 8; rows0p += 8; @@ -1158,9 +1158,9 @@ void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstr int16x4_t _acc16 = vshrn_n_s32(_acc, 2); int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2); - 
uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); + uint8x8_t _Dp = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); - vst1_u8(Dp, _D); + vst1_u8(Dp, _Dp); Dp += 8; rows0p += 8; diff --git a/tools/pnnx/tests/test_torch_max.py b/tools/pnnx/tests/test_torch_max.py index 78575709211..247b6bf3b3d 100644 --- a/tools/pnnx/tests/test_torch_max.py +++ b/tools/pnnx/tests/test_torch_max.py @@ -20,11 +20,12 @@ class Model(nn.Module): def __init__(self): super(Model, self).__init__() - def forward(self, x, y, z): + def forward(self, x, y, z, w): x, x_indices = torch.max(x, dim=1, keepdim=False) y = torch.max(y) + w = torch.max(z, w) z, z_indices = torch.max(z, dim=0, keepdim=True) - return x, x_indices, y, z, z_indices + return x, x_indices, y, z, z_indices, w def test(): net = Model() @@ -34,16 +35,17 @@ def test(): x = torch.rand(1, 3, 16) y = torch.rand(1, 5, 9, 11) z = torch.rand(14, 8, 5, 9, 10) + w = torch.rand(5, 9, 10) - a = net(x, y, z) + a = net(x, y, z, w) # export torchscript - mod = torch.jit.trace(net, (x, y, z)) + mod = torch.jit.trace(net, (x, y, z, w)) mod.save("test_torch_max.pt") # torchscript to pnnx import os - os.system("../src/pnnx test_torch_max.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + os.system("../src/pnnx test_torch_max.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10],[5,9,10]") # pnnx inference import test_torch_max_pnnx diff --git a/tools/pnnx/tests/test_torch_min.py b/tools/pnnx/tests/test_torch_min.py index 521b6ae7a47..9419cc624b7 100644 --- a/tools/pnnx/tests/test_torch_min.py +++ b/tools/pnnx/tests/test_torch_min.py @@ -20,11 +20,12 @@ class Model(nn.Module): def __init__(self): super(Model, self).__init__() - def forward(self, x, y, z): + def forward(self, x, y, z, w): x, x_indices = torch.min(x, dim=1, keepdim=False) y = torch.min(y) + w = torch.min(z, w) z, z_indices = torch.min(z, dim=0, keepdim=True) - return x, x_indices, y, z, z_indices + return x, x_indices, y, z, z_indices, w def test(): net = Model() @@ -34,16 +35,17 @@ def test(): x = torch.rand(1, 3, 16) y = torch.rand(1, 5, 9, 11) z = torch.rand(14, 8, 5, 9, 10) + w = torch.rand(5, 9, 10) - a = net(x, y, z) + a = net(x, y, z, w) # export torchscript - mod = torch.jit.trace(net, (x, y, z)) + mod = torch.jit.trace(net, (x, y, z, w)) mod.save("test_torch_min.pt") # torchscript to pnnx import os - os.system("../src/pnnx test_torch_min.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + os.system("../src/pnnx test_torch_min.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10],[5,9,10]") # pnnx inference import test_torch_min_pnnx
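Note on the failure mode these renames avoid (illustrative only, not part of the patch): some C libraries ship a <ctype.h> that implements isupper()/islower() with character-class bit masks spelled _U, _L, _N and so on as preprocessor macros. When such a header is pulled in transitively, any local variable using one of those names is rewritten by the preprocessor before the compiler sees it, and the build breaks. A minimal sketch, assuming a newlib-style libc; the macro definitions below merely stand in for the toolchain's own:

    // collision_demo.cpp -- hypothetical repro, not taken from the ncnn sources
    // Stand-ins for macros that a newlib-style <ctype.h> may already define.
    #define _U 01 // "upper case" character-class bit
    #define _L 02 // "lower case" character-class bit

    #include <cstdio>

    int main()
    {
        // float _U = 1.0f;  // would not compile: expands to "float 01 = 1.0f;"
        float _gru_U = 1.0f; // prefixed name, nothing for the macro to rewrite
        float _gru_R = 0.5f;
        std::printf("%f\n", _gru_U + _gru_R);
        return 0;
    }

Prefixing the vector temporaries (_gru_*, _lstm_*, _rnn_*, _Dp) keeps them out of the identifier space such headers claim, without changing the generated code.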