Merge pull request #23614 from Abdurrahheem:lstm_layout_attribute
LSTM ONNX Layout Attribute Support #23614 

### Explanation

This PR contains the changes necessary to support the `layout` attribute. The attribute is present in [ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md#lstm) and [Torch](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#lstm) (in Torch it is named `batch_first=True`). When `layout = 1`, the input to the LSTM layer is expected to have the batch dimension first, `[batch_size, sequence_length, features]`, as opposed to the default `layout = 0`, `[sequence_length, batch_size, features]`.
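
Below is a minimal usage sketch of the new behaviour from the C++ side. The model file name `lstm_layout1.onnx` is a hypothetical placeholder for any ONNX LSTM exported with `layout = 1`; shapes are example values.

```cpp
#include <opencv2/dnn.hpp>
#include <iostream>

int main()
{
    // Hypothetical model: an ONNX LSTM exported with layout = 1.
    cv::dnn::Net net = cv::dnn::readNetFromONNX("lstm_layout1.onnx");

    // With layout = 1 the input blob is batch-first:
    // [batch_size, sequence_length, features].
    const int shape[] = {4, 10, 16};
    cv::Mat x(3, shape, CV_32F, cv::Scalar(0.5f));

    net.setInput(x);
    cv::Mat y = net.forward(); // output is batch-first as well
    std::cout << "output dims: " << y.dims << std::endl;
    return 0;
}
```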

### Test Data

Test data and the data generator for this PR are located in [#1063](opencv/opencv_extra#1063).

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
Abdurrahheem authored May 17, 2023
1 parent d2618bf commit d2143bc
Showing 3 changed files with 74 additions and 8 deletions.
56 changes: 50 additions & 6 deletions modules/dnn/src/layers/recurrent_layers.cpp
@@ -113,12 +113,19 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
MatShape outTailShape; //shape of single output sample
MatShape outTsShape; //shape of N output samples

enum layout_t : int {
SEQ_BATCH_HID = 0,
BATCH_SEQ_HID = 1
};

bool useTimestampDim;
bool produceCellOutput;
float forgetBias, cellClip;
bool useCellClip, usePeephole;
bool reverse; // If true, go in negative direction along the time axis
bool bidirectional; // If true, produces both forward and reversed directions along time axis
layout_t layout; // If layout == BATCH_SEQ_HID, uses batch_size x seq_length x num_hidden for input and output
// else uses seq_length x batch_size x num_hidden

ActivationFunction f_activation;
ActivationFunction g_activation;
@@ -198,6 +205,7 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
}
}
}
layout = (layout_t) params.get<int>("layout", SEQ_BATCH_HID);
useTimestampDim = params.get<bool>("use_timestamp_dim", true);
produceCellOutput = params.get<bool>("produce_cell_output", false);
forgetBias = params.get<float>("forget_bias", 0.0f);
@@ -291,8 +299,13 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
if (useTimestampDim)
{
CV_Assert(inp0.size() >= 2 && total(inp0, 2) == _numInp);
_numSamples = inp0[1];
outResShape.push_back(inp0[0]);
if (layout == SEQ_BATCH_HID) {
_numSamples = inp0[1];
outResShape.push_back(inp0[0]);
} else {
_numSamples = inp0[0];
outResShape.push_back(inp0[1]);
}
}
else
{
@@ -349,8 +362,13 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
if (useTimestampDim)
{
CV_Assert(inp0.dims >= 2 && (int)inp0.total(2) == numInp);
numTimeStamps = inp0.size[0];
numSamples = inp0.size[1];
if (layout == SEQ_BATCH_HID) {
numTimeStamps = inp0.size[0];
numSamples = inp0.size[1];
} else {
numTimeStamps = inp0.size[1];
numSamples = inp0.size[0];
}
}
else
{
@@ -383,6 +401,21 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
outputs_arr.getMatVector(output);
internals_arr.getMatVector(internals);

if (layout == BATCH_SEQ_HID) {
// swap axes 0 and 1 of input x
cv::Mat tmp;
// Since the Python input is 4-dimensional and the C++ input is 3-dimensional,
// we need to process each case differently
if (input[0].dims == 4) {
CV_Assert(input[0].size[3] == 1);
cv::transposeND(input[0], {1, 0, 2, 3}, tmp); // back to seq_len, batch_size, hidden_size format
} else {
cv::transposeND(input[0], {1, 0, 2}, tmp); // back to seq_len, batch_size, hidden_size format
}
input[0] = tmp;
}

Mat cOut = produceCellOutput ? output[0].clone() : Mat();
const bool needYcTransform = !originalBlobs.empty(); // if the producer is onnx
const int numDirs = 1 + static_cast<int>(bidirectional);
@@ -599,7 +632,12 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
cInternal.copyTo(cOutTs.rowRange(curRowRange));
}
}

// transpose to match batch-first output
if (layout == BATCH_SEQ_HID) {
cv::Mat tmp;
cv::transposeND(output[0], {1, 0, 2}, tmp);
output[0] = tmp;
}
if (needYcTransform && produceCellOutput)
{
fixCellState(cOut, numDirs);
@@ -618,7 +656,13 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer

// permute to {0, 2, 1, 3};
cv::Mat newCellState;
cv::transposeND(cOut, {0, 2, 1, 3}, newCellState);
// transpose to match batch-first output
if (layout == BATCH_SEQ_HID) {
cv::transposeND(cOut, {2, 0, 1, 3}, newCellState);
} else {
cv::transposeND(cOut, {0, 2, 1, 3}, newCellState);
}
cOut = newCellState;

if (numDirs == 1)
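
As a self-contained reference, the axis permutation used above can be checked with `cv::transposeND` directly; a short sketch with example shapes (requires OpenCV >= 4.6, where `transposeND` is available):

```cpp
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // Batch-first input: [batch_size, seq_length, input_size] = [4, 10, 16]
    const int shape[] = {4, 10, 16};
    cv::Mat batchFirst(3, shape, CV_32F, cv::Scalar(0));

    // The same permutation the layer applies for layout == BATCH_SEQ_HID:
    // axes {1, 0, 2} swap dimensions 0 and 1.
    cv::Mat seqFirst;
    cv::transposeND(batchFirst, {1, 0, 2}, seqFirst);

    // Prints "10 x 4 x 16": back to [seq_length, batch_size, input_size]
    std::cout << seqFirst.size[0] << " x " << seqFirst.size[1]
              << " x " << seqFirst.size[2] << std::endl;
    return 0;
}
```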
12 changes: 10 additions & 2 deletions modules/dnn/src/onnx/onnx_importer.cpp
@@ -1637,8 +1637,16 @@ void ONNXImporter::parseLSTM(LayerParams& layerParams, const opencv_onnx::NodePr
CV_Assert(shapeIt != outShapes.end());
const MatShape x_shape = shapeIt->second;

const int seq_length = x_shape[0];
const int batch_size = x_shape[1];
// if layout is 1, swap the batch and sequence dims
const int layout = layerParams.get<int>("layout", 0);
int batch_size, seq_length;
if (layout == 1) {
batch_size = x_shape[0];
seq_length = x_shape[1];
} else {
seq_length = x_shape[0];
batch_size = x_shape[1];
}
const int input_size = x_shape[2];
const int hidden_size = layerParams.get<int>("hidden_size");
const int num_directions = constBlobs[lstm_proto.input(1)].size[0];
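
The importer change only affects which positions of `x_shape` are read as batch and sequence. A standalone sketch of that bookkeeping (illustration only, not OpenCV API; example shapes):

```cpp
#include <array>
#include <cstdio>

// Mirrors parseLSTM above: given the 3D ONNX input shape X and the layout
// attribute, recover (seq_length, batch_size).
static std::array<int, 2> seqAndBatch(const std::array<int, 3>& x_shape, int layout)
{
    if (layout == 1)
        return {x_shape[1], x_shape[0]}; // X is [batch, seq, input]
    return {x_shape[0], x_shape[1]};     // X is [seq, batch, input] (default)
}

int main()
{
    const std::array<int, 3> batchFirst = {8, 20, 32}; // exported with layout = 1
    const auto sb = seqAndBatch(batchFirst, 1);
    std::printf("seq_length=%d batch_size=%d\n", sb[0], sb[1]); // seq_length=20 batch_size=8
    return 0;
}
```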
14 changes: 14 additions & 0 deletions modules/dnn/test/test_onnx_importer.cpp
@@ -1393,6 +1393,20 @@ TEST_P(Test_ONNX_layers, LSTM_init_h0_c0)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
testONNXModels("lstm_init_h0_c0", npy, 0, 0, false, false, 3);
}
// epsilon is larger because the ONNX output does not match torch/opencv exactly
TEST_P(Test_ONNX_layers, LSTM_layout_seq)
{
if (backend == DNN_BACKEND_CUDA)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
testONNXModels("lstm_layout_0", npy, 0.005, 0.005, false, false, 3);
}
// epsilon is larger because the ONNX output does not match torch/opencv exactly
TEST_P(Test_ONNX_layers, LSTM_layout_batch)
{
if (backend == DNN_BACKEND_CUDA)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
testONNXModels("lstm_layout_1", npy, 0.005, 0.005, false, false, 3);
}

TEST_P(Test_ONNX_layers, Pad2d_Unfused)
{
