Xtensa LSTM: (#2150)
Enabled LSTM kernel support for the XTENSA target.
Updated the xtensa_downloads script to use the latest HiFi NN Libraries.

The 8x16 unit test cases have a non-zero zero_point for the 16-bit output:
https://github.com/tensorflow/tflite-micro/blob/main/tensorflow/lite/micro/kernels/testdata/lstm_test_data.cc#L255C1-L258C61

A default run of all the 8x16 unit test cases results in FAIL; this is due to the non-zero output offset value.

BUG=#1867
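
For context, here is a minimal sketch of where the output zero_point enters int16 requantization; RequantizeInt16 is an illustrative stand-in for TFLM's fixed-point helpers, not code from this commit. An optimized kernel that assumes a zero output offset is biased by exactly that offset on test data with a non-zero zero_point, which matches the FAIL described above.

#include <algorithm>
#include <cstdint>

// Illustrative stand-in for TFLM's requantization helpers: the output
// zero_point (output_offset) is added after rescaling the accumulator,
// so hard-coding output_offset == 0 shifts every output by that offset.
inline int16_t RequantizeInt16(int64_t acc, double effective_scale,
                               int32_t output_offset) {
  int64_t scaled =
      static_cast<int64_t>(acc * effective_scale) + output_offset;
  scaled = std::min<int64_t>(std::max<int64_t>(scaled, INT16_MIN), INT16_MAX);
  return static_cast<int16_t>(scaled);
}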
cad-audio authored Jan 2, 2024
1 parent 6576ef7 commit 17d0e7f
Showing 13 changed files with 2,124 additions and 2,905 deletions.
2 changes: 1 addition & 1 deletion tensorflow/lite/micro/kernels/lstm_eval_test.cc
@@ -454,6 +454,6 @@ TF_LITE_MICRO_TEST(TestLSTMEvalInt16) {
cell_state_tolerance,
int16_node_contents);
}

#endif // !defined(XTENSA)

TF_LITE_MICRO_TESTS_END
tensorflow/lite/micro/kernels/unidirectional_sequence_lstm_test.cc
@@ -150,7 +150,6 @@ void TestUnidirectionalLSTMFloat(
TF_LITE_MICRO_TESTS_BEGIN
// TODO(b/230666079) enable below tests for xtensa when the xtensa
// kernel is reconciled with reference kernel
-#if !defined(XTENSA)
TF_LITE_MICRO_TEST(TestUnidirectionalLSTMFloat) {
const tflite::testing::LstmEvalCheckData<12, 4, 12> kernel_eval_data =
tflite::testing::Get2X2LstmEvalCheckData();
@@ -193,5 +192,4 @@ TF_LITE_MICRO_TEST(TestUnidirectionalLSTMInt16) {
kernel_eval_data, hidden_state_tolerance, cell_state_tolerance,
int16_node_contents);
}
-#endif  // !defined(XTENSA)
TF_LITE_MICRO_TESTS_END
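
With the Xtensa LSTM kernel now reconciled with the reference kernel, the #if !defined(XTENSA) / #endif guards are removed above, so the unidirectional LSTM tests build and run on Xtensa targets as well.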
1,564 changes: 419 additions & 1,145 deletions tensorflow/lite/micro/kernels/xtensa/lstm_eval.cc

Large diffs are not rendered by default.

992 changes: 800 additions & 192 deletions tensorflow/lite/micro/kernels/xtensa/lstm_eval.h

Large diffs are not rendered by default.

615 changes: 587 additions & 28 deletions tensorflow/lite/micro/kernels/xtensa/lstm_eval_hifi.cc

Large diffs are not rendered by default.

78 changes: 0 additions & 78 deletions tensorflow/lite/micro/kernels/xtensa/lstm_shared.h

This file was deleted.

2 changes: 1 addition & 1 deletion tensorflow/lite/micro/kernels/xtensa/svdf.cc
@@ -63,7 +63,7 @@ TfLiteStatus EvalIntegerSvdfHifi(TfLiteContext* context, TfLiteNode* node,
#if defined(HIFI5)
memcpy(state_ptr, state_ptr + 1, num_bytes);
#else
-    xa_nn_memmove_16(state_ptr, state_ptr + 1, num_bytes);
+    xa_nn_memmove_16(state_ptr, state_ptr + 1, (num_bytes >> 1));
#endif // defined(HIFI5)

// Note: no need to clear the latest activation, matmul is not accumulative.
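A hedged sketch of the unit fix above, assuming (as the new argument implies) that xa_nn_memmove_16 takes a count of 16-bit elements while memcpy takes bytes; xa_nn_memmove_16_sketch below is a stand-in, not the real HiFi API.

#include <cstdint>
#include <cstring>

// Stand-in for the HiFi NN library call; assumed to count int16 elements.
static void xa_nn_memmove_16_sketch(int16_t* dst, const int16_t* src,
                                    int num_elements) {
  std::memmove(dst, src, num_elements * sizeof(int16_t));
}

static void ShiftInt16StateLeft(int16_t* state_ptr, int num_bytes) {
  // memcpy/memmove count bytes, so the HIFI5 branch passes num_bytes as-is;
  // an element-count API needs num_bytes / 2 for int16 data, hence the
  // (num_bytes >> 1) in the fix above.
  xa_nn_memmove_16_sketch(state_ptr, state_ptr + 1, num_bytes >> 1);
}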
104 changes: 100 additions & 4 deletions tensorflow/lite/micro/kernels/xtensa/transpose_conv.cc
@@ -183,19 +183,57 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Quantized kernels use an int32 scratch buffer.
if (input->type == kTfLiteInt8) {
TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
#if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
const int stride_width = params->stride_width;
const int stride_height = params->stride_height;

const int input_height = SizeOfDimension(input, 1);
const int input_width = SizeOfDimension(input, 2);
const int input_depth = SizeOfDimension(input, 3);
const int output_height = height;
const int output_width = width;
int32_t scratch_buffer_size = 0;
scratch_buffer_size = xa_nn_transpose_conv_getsize(
input_height, input_width, input_depth, filter_height, filter_width,
stride_width, stride_height, output_height, output_width, num_channels,
PREC_SYM8S, PREC_ASYM8S);
TFLITE_DCHECK(context->RequestScratchBufferInArena(
context, scratch_buffer_size,
&(data->scratch_buffer_index)) == kTfLiteOk);
#else // #if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
TFLITE_DCHECK(context->RequestScratchBufferInArena(
context,
GetTensorShape(output).FlatSize() * sizeof(int32_t),
&(data->scratch_buffer_index)) == kTfLiteOk);
#endif
}

// Quantized 16x8 kernels use an int64 scratch buffer.
if (input->type == kTfLiteInt16) {
TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
#if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
const int stride_width = params->stride_width;
const int stride_height = params->stride_height;

const int input_height = SizeOfDimension(input, 1);
const int input_width = SizeOfDimension(input, 2);
const int input_depth = SizeOfDimension(input, 3);
const int output_height = height;
const int output_width = width;
int32_t scratch_buffer_size = 0;
scratch_buffer_size = xa_nn_transpose_conv_getsize(
input_height, input_width, input_depth, filter_height, filter_width,
stride_width, stride_height, output_height, output_width, num_channels,
PREC_SYM8S, PREC_SYM16S);
TFLITE_DCHECK(context->RequestScratchBufferInArena(
context, scratch_buffer_size,
&(data->scratch_buffer_index)) == kTfLiteOk);
#else // #if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
TFLITE_DCHECK(context->RequestScratchBufferInArena(
context,
GetTensorShape(output).FlatSize() * sizeof(std::int64_t),
&(data->scratch_buffer_index)) == kTfLiteOk);
#endif // #if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
}

// All per-channel quantized tensors need valid zero point and scale arrays.
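
On HiFi targets the scratch requirement now comes from xa_nn_transpose_conv_getsize above; on other targets Prepare keeps sizing the buffer from the output shape. A minimal sketch of that fallback rule (the helper name is illustrative, not from the commit):

#include <cstddef>
#include <cstdint>

// The reference TransposeConv keeps one int32 accumulator per output element
// for int8 I/O and one int64 per element for the 16x8 kernel, so the fallback
// scratch request is the flattened output size times the accumulator width.
size_t ReferenceScratchBytes(int output_flat_size, bool is_16x8_kernel) {
  return static_cast<size_t>(output_flat_size) *
         (is_16x8_kernel ? sizeof(int64_t) : sizeof(int32_t));
}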
@@ -282,6 +320,63 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
case kTfLiteInt8: {
int32_t* scratch_buffer = static_cast<int32_t*>(
context->GetScratchBuffer(context, data.scratch_buffer_index));
#if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
if (bias->type == kTfLiteInt32) {
const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input);
const RuntimeShape& filter_shape =
tflite::micro::GetTensorShape(filter);
const RuntimeShape& output_shape =
tflite::micro::GetTensorShape(output);
const int stride_width = data.params.stride_width;
const int stride_height = data.params.stride_height;
const int pad_width = data.params.padding_values.width;
const int pad_height = data.params.padding_values.height;

const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);

const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int8_t* input_data = tflite::micro::GetTensorData<int8_t>(input);
const int8_t* filter_data =
tflite::micro::GetTensorData<int8_t>(filter);
const int32_t* bias_data = tflite::micro::GetTensorData<int32_t>(bias);
int8_t* output_data = tflite::micro::GetTensorData<int8_t>(output);

const int num_elements = output_shape.FlatSize();

for (int b = 0; b < batches; b++) {
xa_nn_transpose_conv_sym8sxasym8s(
&output_data[b * output_height * output_width * output_depth],
const_cast<WORD8*>(
&input_data[b * input_height * input_width * input_depth]),
const_cast<WORD8*>(filter_data), const_cast<WORD32*>(bias_data),
stride_width, stride_height, pad_width, pad_height, input_depth,
output_depth, input_height, input_width, filter_height,
filter_width, output_height, output_width, num_elements / batches,
data.params.input_offset, data.params.output_offset,
data.per_channel_output_shift, data.per_channel_output_multiplier,
scratch_buffer);
}
} else {
reference_integer_ops::TransposeConv(
data.params, data.per_channel_output_multiplier,
data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<int8_t>(input),
tflite::micro::GetTensorShape(filter),
tflite::micro::GetTensorData<int8_t>(filter),
tflite::micro::GetTensorShape(bias),
tflite::micro::GetTensorData<int32_t>(bias),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output),
tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer);
}
#else
reference_integer_ops::TransposeConv(
data.params, data.per_channel_output_multiplier,
data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
@@ -293,6 +388,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output),
tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer);
#endif
break;
}
case kTfLiteInt16: {
@@ -319,7 +415,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
tflite::micro::GetTensorData<int16_t>(output),
tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer);
} else {
-#if defined(HIFI3) || defined(HIFI4)
+#if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input);
const RuntimeShape& filter_shape =
tflite::micro::GetTensorShape(filter);
@@ -359,9 +455,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
output_depth, input_height, input_width, filter_height,
filter_width, output_height, output_width, num_elements / batches,
data.per_channel_output_shift, data.per_channel_output_multiplier,
-            &scratch_buffer[b * output_height * output_width * output_depth]);
+            scratch_buffer);
}
-#else
+#else  // #if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
reference_integer_ops::TransposeConv(
data.params, data.per_channel_output_multiplier,
data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
@@ -373,7 +469,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int16_t>(output),
tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer);
-#endif  // defined(HIFI3) || defined(HIFI4)
+#endif  // #if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
}
break;
}
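Two notes on the int16 Eval path above: the fast path is now compiled for HIFI5 in addition to HIFI3/HIFI4, and the per-batch offset into scratch_buffer was dropped, so every batch reuses the same scratch area, which is consistent with the library-sized allocation requested in Prepare. The int8 fast path likewise falls back to the reference TransposeConv whenever the bias tensor is not int32.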