From bd7ae44144d7c4768fb9143831902d87388e9dec Mon Sep 17 00:00:00 2001
From: Nuzhny007 <nuzhny@mail.ru>
Date: Tue, 10 Dec 2024 22:52:23 +0300
Subject: [PATCH 1/2] Correct aspect ratio after frame to input resizing

---
 src/Detector/tensorrt_yolo/YoloONNX.cpp       |  7 ++++---
 src/Detector/tensorrt_yolo/YoloONNX.hpp       |  7 ++++---
 src/Detector/tensorrt_yolo/YoloONNXv10_bb.hpp |  8 +++----
 src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp |  8 +++----
 .../tensorrt_yolo/YoloONNXv11_instance.hpp    |  8 +++----
 .../tensorrt_yolo/YoloONNXv11_obb.hpp         |  8 +++----
 src/Detector/tensorrt_yolo/YoloONNXv6_bb.hpp  | 21 ++++++++-----------
 src/Detector/tensorrt_yolo/YoloONNXv7_bb.hpp  | 21 ++++++++-----------
 .../tensorrt_yolo/YoloONNXv7_instance.hpp     |  8 +++----
 src/Detector/tensorrt_yolo/YoloONNXv8_bb.hpp  |  8 +++----
 .../tensorrt_yolo/YoloONNXv8_instance.hpp     |  8 +++----
 src/Detector/tensorrt_yolo/YoloONNXv8_obb.hpp |  8 +++----
 src/Detector/tensorrt_yolo/YoloONNXv9_bb.hpp  |  8 +++----
 13 files changed, 62 insertions(+), 66 deletions(-)
diff --git a/src/Detector/tensorrt_yolo/YoloONNX.cpp b/src/Detector/tensorrt_yolo/YoloONNX.cpp
index 93ca4435..690be11c 100644
--- a/src/Detector/tensorrt_yolo/YoloONNX.cpp
+++ b/src/Detector/tensorrt_yolo/YoloONNX.cpp
@@ -326,8 +326,9 @@ bool YoloONNX::ProcessInputAspectRatio(const std::vector<cv::Mat>& sampleImages)
         }
     }
 
-#if 0
+    m_resizedROI = cv::Rect(0, 0, inputW, inputH);
 
+#if 1
     // resize the DsImage with scale
     const float imgHeight = static_cast<float>(sampleImages[0].rows);
     const float imgWidth = static_cast<float>(sampleImages[0].cols);
@@ -351,7 +352,7 @@ bool YoloONNX::ProcessInputAspectRatio(const std::vector<cv::Mat>& sampleImages)
     assert(2 * yOffset + resizeH == inputH);
 
     cv::Size scaleSize(inputW, inputH);
-    cv::Rect roiRect(xOffset, yOffset, resizeW, resizeH);
+    m_resizedROI = cv::Rect(xOffset, yOffset, resizeW, resizeH);
 
     if (m_resizedBatch.size() < sampleImages.size())
         m_resizedBatch.resize(sampleImages.size());
@@ -361,7 +362,7 @@ bool YoloONNX::ProcessInputAspectRatio(const std::vector<cv::Mat>& sampleImages)
     {
         if (m_resizedBatch[b].size() != scaleSize)
             m_resizedBatch[b] = cv::Mat(scaleSize, sampleImages[b].type(), cv::Scalar::all(128));
-        cv::resize(sampleImages[b], cv::Mat(m_resizedBatch[b], roiRect), roiRect.size(), 0, 0, cv::INTER_LINEAR);
+        cv::resize(sampleImages[b], cv::Mat(m_resizedBatch[b], m_resizedROI), m_resizedROI.size(), 0, 0, cv::INTER_LINEAR);
         cv::split(m_resizedBatch[b], m_inputChannels[b]);
         std::swap(m_inputChannels[b][0], m_inputChannels[b][2]);
     }
diff --git a/src/Detector/tensorrt_yolo/YoloONNX.hpp b/src/Detector/tensorrt_yolo/YoloONNX.hpp
index 8152d166..ffd2f0bf 100644
--- a/src/Detector/tensorrt_yolo/YoloONNX.hpp
+++ b/src/Detector/tensorrt_yolo/YoloONNX.hpp
@@ -79,9 +79,10 @@ class YoloONNX
     size_t GetNumClasses() const;
 
 protected:
-    SampleYoloParams m_params; //!< The parameters for the sample.
-    nvinfer1::Dims m_inputDims; //!< The dimensions of the input to the network.
-    std::vector<nvinfer1::Dims> m_outpuDims; //!< The dimensions of the input to the network.
+    SampleYoloParams m_params;               //!< The parameters for the sample
+    nvinfer1::Dims m_inputDims;              //!< The dimensions of the input to the network
+    std::vector<nvinfer1::Dims> m_outpuDims; //!< The dimensions of the outputs of the network
+    cv::Rect m_resizedROI;                   //!< Region of the network input filled by the frame after the aspect-ratio-preserving resize
 
     virtual std::vector<tensor_rt::Result> GetResult(size_t imgIdx, int keep_topk, const std::vector<float*>& outputs, cv::Size frameSize) = 0;
 
diff --git a/src/Detector/tensorrt_yolo/YoloONNXv10_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv10_bb.hpp
index 137dbf8e..648e420f 100644
--- a/src/Detector/tensorrt_yolo/YoloONNXv10_bb.hpp
+++ b/src/Detector/tensorrt_yolo/YoloONNXv10_bb.hpp
@@ -20,8 +20,8 @@ class YOLOv10_bb_onnx : public YoloONNX
 		//0: name: images, size: 1x3x640x640
 		//1: name: output0, size: 1x300x6
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		auto output = outputs[0];
 
@@ -51,8 +51,8 @@ class YOLOv10_bb_onnx : public YoloONNX
 			//if (i == 0)
 			//	std::cout << i << ": " << output[k + 0] << " " << output[k + 1] << " " << output[k + 2] << " " << output[k + 3] << " " << output[k + 4] << " " << output[k + 5] << std::endl;
 
-			float x = fw * output[k + 0];
-			float y = fh * output[k + 1];
+			float x = fw * (output[k + 0] - m_resizedROI.x);
+			float y = fh * (output[k + 1] - m_resizedROI.y);
 			float width = fw * (output[k + 2] - output[k + 0]);
 			float height = fh * (output[k + 3] - output[k + 1]);
 			float objectConf = output[k + 4];
diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp
index 9103bfa6..654f9ea4 100644
--- a/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp
+++ b/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp
@@ -20,8 +20,8 @@ class YOLOv11_bb_onnx : public YoloONNX
 		//0: name: images, size: 1x3x640x640
 		//1: name: output0, size: 1x84x8400
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		auto output = outputs[0];
 
@@ -88,8 +88,8 @@ class YOLOv11_bb_onnx : public YoloONNX
 				confidences.push_back(objectConf);
 
 				// (center x, center y, width, height) to (x, y, w, h)
-				float x = fw * (output[k] - output[k + 2] / 2);
-				float y = fh * (output[k + 1] - output[k + 3] / 2);
+				float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+				float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 				rectBoxes.emplace_back(cvRound(x), cvRound(y), cvRound(width), cvRound(height));
diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp
index 54fc6b01..30261daf 100644
--- a/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp
+++ b/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp
@@ -17,8 +17,8 @@ class YOLOv11_instance_onnx : public YoloONNX
 	{
 		std::vector<tensor_rt::Result> resBoxes;
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		size_t outInd = (outputs.size() == 0) ? 1 : 0;
 		size_t segInd = (outputs.size() == 0) ? 0 : 1;
@@ -155,8 +155,8 @@ class YOLOv11_instance_onnx : public YoloONNX
 			if (objectConf >= m_params.confThreshold)
 			{
 				// (center x, center y, width, height) to (x, y, w, h)
-				float x = fw * (output[k] - output[k + 2] / 2);
-				float y = fh * (output[k + 1] - output[k + 3] / 2);
+				float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+				float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 
diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp
index 7c2b98ce..c35b16c9 100644
--- a/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp
+++ b/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp
@@ -22,8 +22,8 @@ class YOLOv11_obb_onnx : public YoloONNX
 		//20: 15 DOTA classes + x + y + w + h + a
 		constexpr int shapeDataSize = 5;
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		auto output = outputs[0];
 
@@ -96,8 +96,8 @@ class YOLOv11_obb_onnx : public YoloONNX
 				confidences.push_back(objectConf);
 
 				// (center x, center y, width, height)
-				float cx = fw * output[k];
-				float cy = fh * output[k + 1];
+				float cx = fw * (output[k] - m_resizedROI.x);
+				float cy = fh * (output[k + 1] - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 				float angle = 180.f * output[k + nc + shapeDataSize - 1] / M_PI;
diff --git a/src/Detector/tensorrt_yolo/YoloONNXv6_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv6_bb.hpp
index 4d6a6268..ef82ca23 100644
--- a/src/Detector/tensorrt_yolo/YoloONNXv6_bb.hpp
+++ b/src/Detector/tensorrt_yolo/YoloONNXv6_bb.hpp
@@ -17,6 +17,9 @@ class YOLOv6_bb_onnx : public YoloONNX
 	{
 		std::vector<tensor_rt::Result> resBoxes;
 
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
+
 		if (outputs.size() == 4)
 		{
 			auto dets = reinterpret_cast<int*>(outputs[0]);
@@ -26,9 +29,6 @@ class YOLOv6_bb_onnx : public YoloONNX
 
 			int objectsCount = m_outpuDims[1].d[1];
 
-			const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-			const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
-
 			//std::cout << "Dets[" << imgIdx << "] = " << dets[imgIdx] << ", objectsCount = " << objectsCount << std::endl;
 
 			const size_t step1 = imgIdx * objectsCount;
@@ -41,8 +41,8 @@ class YOLOv6_bb_onnx : public YoloONNX
 				int classId = classes[i + step1];
 				if (class_conf >= m_params.confThreshold)
 				{
-					float x = fw * boxes[k + 0 + step2];
-					float y = fh * boxes[k + 1 + step2];
-					float width = fw * boxes[k + 2 + step2] - x;
-					float height = fh * boxes[k + 3 + step2] - y;
+					float x = fw * (boxes[k + 0 + step2] - m_resizedROI.x); // drop the letterbox offset before scaling to frame pixels
+					float y = fh * (boxes[k + 1 + step2] - m_resizedROI.y);
+					float width = fw * (boxes[k + 2 + step2] - boxes[k + 0 + step2]); // corner difference is offset-free; "fw * x2 - x" would add fw * m_resizedROI.x
+					float height = fh * (boxes[k + 3 + step2] - boxes[k + 1 + step2]); // same reasoning for the vertical offset
 
@@ -57,9 +57,6 @@ class YOLOv6_bb_onnx : public YoloONNX
 		}
 		else if (outputs.size() == 1)
 		{
-			const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-			const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
-
 			auto output = outputs[0];
 
 			size_t ncInd = 2;
@@ -96,8 +93,8 @@ class YOLOv6_bb_onnx : public YoloONNX
 					int classId = cvRound(output[k + 5]);
 					if (class_conf >= m_params.confThreshold)
 					{
-						float x = fw * output[k + 1];
-						float y = fh * output[k + 2];
+						float x = fw * (output[k + 1] - m_resizedROI.x);
+						float y = fh * (output[k + 2] - m_resizedROI.y);
 						float width = fw * (output[k + 3] - output[k + 1]);
 						float height = fh * (output[k + 4] - output[k + 2]);
 
@@ -150,8 +147,8 @@ class YOLOv6_bb_onnx : public YoloONNX
 					if (object_conf >= m_params.confThreshold)
 					{
 						// (center x, center y, width, height) to (x, y, w, h)
-						float x = fw * (output[k] - output[k + 2] / 2);
-						float y = fh * (output[k + 1] - output[k + 3] / 2);
+						float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+						float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 						float width = fw * output[k + 2];
 						float height = fh * output[k + 3];
 
diff --git a/src/Detector/tensorrt_yolo/YoloONNXv7_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv7_bb.hpp
index 78a383b4..946daf4c 100644
--- a/src/Detector/tensorrt_yolo/YoloONNXv7_bb.hpp
+++ b/src/Detector/tensorrt_yolo/YoloONNXv7_bb.hpp
@@ -17,6 +17,9 @@ class YOLOv7_bb_onnx : public YoloONNX
 	{
 		std::vector<tensor_rt::Result> resBoxes;
 
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
+
 		if (outputs.size() == 4)
 		{
 			auto dets = reinterpret_cast<int*>(outputs[0]);
@@ -26,9 +29,6 @@ class YOLOv7_bb_onnx : public YoloONNX
 
 			int objectsCount = m_outpuDims[1].d[1];
 
-			const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-			const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
-
 			//std::cout << "Dets[" << imgIdx << "] = " << dets[imgIdx] << ", objectsCount = " << objectsCount << std::endl;
 
 			const size_t step1 = imgIdx * objectsCount;
@@ -41,8 +41,8 @@ class YOLOv7_bb_onnx : public YoloONNX
 				int classId = classes[i + step1];
 				if (class_conf >= m_params.confThreshold)
 				{
-					float x = fw * boxes[k + 0 + step2];
-					float y = fh * boxes[k + 1 + step2];
-					float width = fw * boxes[k + 2 + step2] - x;
-					float height = fh * boxes[k + 3 + step2] - y;
+					float x = fw * (boxes[k + 0 + step2] - m_resizedROI.x); // drop the letterbox offset before scaling to frame pixels
+					float y = fh * (boxes[k + 1 + step2] - m_resizedROI.y);
+					float width = fw * (boxes[k + 2 + step2] - boxes[k + 0 + step2]); // corner difference is offset-free; "fw * x2 - x" would add fw * m_resizedROI.x
+					float height = fh * (boxes[k + 3 + step2] - boxes[k + 1 + step2]); // same reasoning for the vertical offset
 
@@ -57,9 +57,6 @@ class YOLOv7_bb_onnx : public YoloONNX
 		}
 		else if (outputs.size() == 1)
 		{
-			const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-			const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
-
 			auto output = outputs[0];
 
 			size_t ncInd = 2;
@@ -96,8 +93,8 @@ class YOLOv7_bb_onnx : public YoloONNX
 					int classId = cvRound(output[k + 5]);
 					if (class_conf >= m_params.confThreshold)
 					{
-						float x = fw * output[k + 1];
-						float y = fh * output[k + 2];
+						float x = fw * (output[k + 1] - m_resizedROI.x);
+						float y = fh * (output[k + 2] - m_resizedROI.y);
 						float width = fw * (output[k + 3] - output[k + 1]);
 						float height = fh * (output[k + 4] - output[k + 2]);
 
@@ -150,8 +147,8 @@ class YOLOv7_bb_onnx : public YoloONNX
 					if (object_conf >= m_params.confThreshold)
 					{
 						// (center x, center y, width, height) to (x, y, w, h)
-						float x = fw * (output[k] - output[k + 2] / 2);
-						float y = fh * (output[k + 1] - output[k + 3] / 2);
+						float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+						float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 						float width = fw * output[k + 2];
 						float height = fh * output[k + 3];
 
diff --git a/src/Detector/tensorrt_yolo/YoloONNXv7_instance.hpp b/src/Detector/tensorrt_yolo/YoloONNXv7_instance.hpp
index 73a5d671..1f962392 100644
--- a/src/Detector/tensorrt_yolo/YoloONNXv7_instance.hpp
+++ b/src/Detector/tensorrt_yolo/YoloONNXv7_instance.hpp
@@ -18,8 +18,8 @@ class YOLOv7_instance_onnx : public YoloONNX
 	{
 		std::vector<tensor_rt::Result> resBoxes;
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		size_t outInd = (outputs.size() == 0) ? 0 : 1;
 		size_t segInd = (outputs.size() == 0) ? 1 : 0;
@@ -123,8 +123,8 @@ class YOLOv7_instance_onnx : public YoloONNX
 			if (object_conf >= m_params.confThreshold)
 			{
 				// (center x, center y, width, height) to (x, y, w, h)
-				float x = fw * (output[k] - output[k + 2] / 2);
-				float y = fh * (output[k + 1] - output[k + 3] / 2);
+				float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+				float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 
diff --git a/src/Detector/tensorrt_yolo/YoloONNXv8_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv8_bb.hpp
index 9a21e397..4e24d2f5 100644
--- a/src/Detector/tensorrt_yolo/YoloONNXv8_bb.hpp
+++ b/src/Detector/tensorrt_yolo/YoloONNXv8_bb.hpp
@@ -20,8 +20,8 @@ class YOLOv8_bb_onnx : public YoloONNX
 		//0: name: images, size: 1x3x640x640
 		//1: name: output0, size: 1x84x8400
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		auto output = outputs[0];
 
@@ -88,8 +88,8 @@ class YOLOv8_bb_onnx : public YoloONNX
 				confidences.push_back(objectConf);
 
 				// (center x, center y, width, height) to (x, y, w, h)
-				float x = fw * (output[k] - output[k + 2] / 2);
-				float y = fh * (output[k + 1] - output[k + 3] / 2);
+				float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+				float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 				rectBoxes.emplace_back(cvRound(x), cvRound(y), cvRound(width), cvRound(height));
diff --git a/src/Detector/tensorrt_yolo/YoloONNXv8_instance.hpp b/src/Detector/tensorrt_yolo/YoloONNXv8_instance.hpp
index dbdf20fd..dff444b8 100644
--- a/src/Detector/tensorrt_yolo/YoloONNXv8_instance.hpp
+++ b/src/Detector/tensorrt_yolo/YoloONNXv8_instance.hpp
@@ -17,8 +17,8 @@ class YOLOv8_instance_onnx : public YoloONNX
 	{
 		std::vector<tensor_rt::Result> resBoxes;
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		size_t outInd = (outputs.size() == 0) ? 0 : 1;
 		size_t segInd = (outputs.size() == 0) ? 1 : 0;
@@ -155,8 +155,8 @@ class YOLOv8_instance_onnx : public YoloONNX
 			if (objectConf >= m_params.confThreshold)
 			{
 				// (center x, center y, width, height) to (x, y, w, h)
-				float x = fw * (output[k] - output[k + 2] / 2);
-				float y = fh * (output[k + 1] - output[k + 3] / 2);
+				float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+				float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 
diff --git a/src/Detector/tensorrt_yolo/YoloONNXv8_obb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv8_obb.hpp
index 4c39c5a4..e9a232dd 100644
--- a/src/Detector/tensorrt_yolo/YoloONNXv8_obb.hpp
+++ b/src/Detector/tensorrt_yolo/YoloONNXv8_obb.hpp
@@ -22,8 +22,8 @@ class YOLOv8_obb_onnx : public YoloONNX
 		//20: 15 DOTA classes + x + y + w + h + a
 		constexpr int shapeDataSize = 5;
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		auto output = outputs[0];
 
@@ -96,8 +96,8 @@ class YOLOv8_obb_onnx : public YoloONNX
 				confidences.push_back(objectConf);
 
 				// (center x, center y, width, height)
-				float cx = fw * output[k];
-				float cy = fh * output[k + 1];
+				float cx = fw * (output[k] - m_resizedROI.x);
+				float cy = fh * (output[k + 1] - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 				float angle = 180.f * output[k + nc + shapeDataSize - 1] / M_PI;
diff --git a/src/Detector/tensorrt_yolo/YoloONNXv9_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv9_bb.hpp
index f4c99ebd..a934877b 100644
--- a/src/Detector/tensorrt_yolo/YoloONNXv9_bb.hpp
+++ b/src/Detector/tensorrt_yolo/YoloONNXv9_bb.hpp
@@ -22,8 +22,8 @@ class YOLOv9_bb_onnx : public YoloONNX
 		//84: 80 COCO classes + x + y + w + h
 		constexpr int shapeDataSize = 4;
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		auto output = outputs[0];
 
@@ -90,8 +90,8 @@ class YOLOv9_bb_onnx : public YoloONNX
 				confidences.push_back(objectConf);
 
 				// (center x, center y, width, height) to (x, y, w, h)
-				float x = fw * (output[k] - output[k + 2] / 2);
-				float y = fh * (output[k + 1] - output[k + 3] / 2);
+				float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+				float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 				rectBoxes.emplace_back(cvRound(x), cvRound(y), cvRound(width), cvRound(height));

From 1a2b8096438f0e37b7f747a94eecf70857231dcc Mon Sep 17 00:00:00 2001
From: Nuzhny007 <nuzhny@mail.ru>
Date: Tue, 10 Dec 2024 23:46:43 +0300
Subject: [PATCH 2/2] Update RuCLIP library

---
 thirdparty/ruclip/RuCLIPProcessor.cpp         | 24 +++++++
 thirdparty/ruclip/RuCLIPProcessor.h           | 70 ++++++++-----------
 .../youtokentome/third_party/flat_hash_map.h  |  4 +-
 3 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/thirdparty/ruclip/RuCLIPProcessor.cpp b/thirdparty/ruclip/RuCLIPProcessor.cpp
index 8f617ee2..242ef31c 100644
--- a/thirdparty/ruclip/RuCLIPProcessor.cpp
+++ b/thirdparty/ruclip/RuCLIPProcessor.cpp
@@ -1,5 +1,29 @@
 #include "RuCLIPProcessor.h"
 
+/// Converts an 8-bit HxWxC cv::Mat into a float tensor divided by 255; the result is 1xCxHxW when perm is true and 1xHxWxC otherwise
+torch::Tensor CVMatToTorchTensor(const cv::Mat img, const bool perm = true)
+{
+	// from_blob assumes densely packed rows, so ROI views and other non-continuous mats are copied into a packed buffer first
+	const cv::Mat packed = img.isContinuous() ? img : img.clone();
+	auto tensor_image = torch::from_blob(packed.data, { packed.rows, packed.cols, packed.channels() }, at::kByte);
+	if (perm)
+		tensor_image = tensor_image.permute({ 2,0,1 });
+	return tensor_image.unsqueeze(0).toType(c10::kFloat).div(255);	// the byte -> float conversion allocates new storage, so the result does not alias packed.data
+}
+
+/// Converts a tensor (CxHxW when perm is true, HxWxC otherwise; size-1 dims are squeezed away) into an 8-bit cv::Mat, scaling by 255 and clamping to [0, 255]
+cv::Mat TorchTensorToCVMat(const torch::Tensor tensor_image, const bool perm = true)
+{
+	auto t = tensor_image.detach().squeeze().cpu();
+	if (perm)
+		t = t.permute({ 1, 2, 0 });
+	// permute only rewrites strides and the element-wise ops can preserve that layout, so force a dense HxWxC buffer before cv::Mat reads the raw pointer
+	t = t.mul(255).clamp(0, 255).to(torch::kU8).contiguous();
+	// clone() gives the returned Mat its own pixels; the tensor storage behind data_ptr() is released when t goes out of scope
+	return cv::Mat(static_cast<int>(t.size(0)), static_cast<int>(t.size(1)), CV_MAKETYPE(CV_8U, t.sizes().size() >= 3 ? static_cast<int>(t.size(2)) : 1), t.data_ptr()).clone();
+}
+
+///
 RuCLIPProcessor :: RuCLIPProcessor(
 	const std::string& tokenizer_path,
 	const int image_size /*= 224*/,
diff --git a/thirdparty/ruclip/RuCLIPProcessor.h b/thirdparty/ruclip/RuCLIPProcessor.h
index be0a1cfa..6c6ac18e 100644
--- a/thirdparty/ruclip/RuCLIPProcessor.h
+++ b/thirdparty/ruclip/RuCLIPProcessor.h
@@ -8,45 +8,6 @@
 #include <filesystem>
 #include <fstream>
 
-inline torch::Tensor CVMatToTorchTensor(const cv::Mat img, const bool perm = true)
-{
-	auto tensor_image = torch::from_blob(img.data, { img.rows, img.cols, img.channels() }, at::kByte);
-	if (perm)
-		tensor_image = tensor_image.permute({ 2,0,1 });
-	tensor_image.unsqueeze_(0);
-	tensor_image = tensor_image.toType(c10::kFloat).div(255);
-	return tensor_image;		//tensor_image.clone();
-}
-
-inline cv::Mat TorchTensorToCVMat(const torch::Tensor tensor_image, const bool perm = true)
-{
-	auto t = tensor_image.detach().squeeze().cpu();
-	if (perm)
-		t = t.permute({ 1, 2, 0 });
-	t = t.mul(255).clamp(0, 255).to(torch::kU8);
-	cv::Mat result_img;
-	cv::Mat(static_cast<int>(t.size(0)), static_cast<int>(t.size(1)), CV_MAKETYPE(CV_8U, t.sizes().size() >= 3 ? static_cast<int>(t.size(2)) : 1), t.data_ptr()).copyTo(result_img);
-	return result_img;
-}
-
-//template <typename T>
-//std::basic_string<T> lowercase(const std::basic_string<T>& s)
-//{
-//	std::basic_string<T> s2 = s;
-//	std::transform(s2.begin(), s2.end(), s2.begin(),
-//		[](const T v) { return static_cast<T>(std::tolower(v)); });
-//	return s2;
-//}
-//
-//template <typename T>
-//std::basic_string<T> uppercase(const std::basic_string<T>& s)
-//{
-//	std::basic_string<T> s2 = s;
-//	std::transform(s2.begin(), s2.end(), s2.begin(),
-//		[](const T v) { return static_cast<T>(std::toupper(v)); });
-//	return s2;
-//}
-
 ///
 class RuCLIPProcessor
 {
@@ -95,3 +56,34 @@ class RuCLIPProcessor
 
 	std::vector<torch::Tensor> m_textsTensors;
 };
+
+/// Scores each row of embeds against the positive phrase relative to every negative phrase and returns, per row, the [positive, negative] softmax pair of the negative that gives the positive its lowest probability. Per the original note it is used with batch size == 1 at this moment: float lv = result.index({0,0}).item<float>();
+/// Usage example (the Russian phrases are kept as written; an English gloss follows each of them):
+///std::vector<torch::Tensor> canon_texts_tensors;
+///canon_texts_tensors.push_back(ClipProcessor->EncodeText(std::string("объект"))); // "object"
+///canon_texts_tensors.push_back(ClipProcessor->EncodeText(std::string("вещи"))); // "things"
+///canon_texts_tensors.push_back(ClipProcessor->EncodeText(std::string("текстура"))); // "texture"
+///int negatives_len =  (int)canon_texts_tensors.size();
+///auto canon_features = Clip->EncodeText(torch::stack(canon_texts_tensors).to(Device)).to(torch::kCPU); ///[3, 768]
+///canon_features = canon_features / canon_features.norm(2/*L2*/, -1, true);
+///auto input = ClipProcessor->EncodeText(std::string("малый барабан")); // "snare drum"
+///auto text_features = Clip->EncodeText(input.unsqueeze(0).to(Device)).to(torch::kCPU);		///[1, 768]
+///text_features = text_features / text_features.norm(2/*L2*/, -1, true);
+///torch::Tensor image_features = PyramidClipEmbedding.GetPixelValue(i,j,0.5f,img_id,pyramid_embedder_properties,cv::Size(data.W, data.H)).to(torch::kCPU);
+///image_features = image_features / image_features.norm(2/*L2*/, -1, true);
+///torch::Tensor rel = Relevancy(image_features, text_features, canon_features);
+///float lv = rel.index({0,0}).item<float>();
+inline torch::Tensor Relevancy(torch::Tensor embeds, torch::Tensor positives, torch::Tensor negatives)
+{
+	auto embeds2 = torch::cat({ positives, negatives });  // phrase rows: positives first, negatives after
+	auto logits = /*scale * */torch::mm(embeds, embeds2.t());  //[batch_size x phrases]
+	auto positive_vals = logits.index({ "...", torch::indexing::Slice(0, 1) });  // [batch_size x 1]; only column 0 is taken, so this assumes positives has exactly one row
+	auto negative_vals = logits.index({ "...", torch::indexing::Slice(1, torch::indexing::None) });		// [batch_size x negative_phrase_n]
+	auto repeated_pos = positive_vals.repeat({ 1, negatives.sizes()[0] });  //[batch_size x negative_phrase_n]
+	auto sims = torch::stack({ repeated_pos, negative_vals }, -1);   //[batch_size x negative_phrase_n x 2], last axis = (positive, negative)
+	auto smx = torch::softmax(10 * sims, -1);                      // [batch_size x negative_phrase_n x 2], pairwise softmax with a fixed factor of 10
+	auto best_id = smx.index({ "...", 0 }).argmin(1);                // [batch_size], index of the negative for which the positive probability is lowest
+	auto result = torch::gather(smx, 1, best_id.index({ "...", torch::indexing::None, torch::indexing::None }).expand({ best_id.sizes()[0], negatives.sizes()[0], 2 })
+	).index({ torch::indexing::Slice(), 0, torch::indexing::Slice() });// [batch_size x 2], the softmax pair selected by best_id for each row
+	return result;
+}
diff --git a/thirdparty/ruclip/youtokentome/third_party/flat_hash_map.h b/thirdparty/ruclip/youtokentome/third_party/flat_hash_map.h
index 4c4aa287..b900359e 100644
--- a/thirdparty/ruclip/youtokentome/third_party/flat_hash_map.h
+++ b/thirdparty/ruclip/youtokentome/third_party/flat_hash_map.h
@@ -12,6 +12,8 @@
 #include <iterator>
 #include <utility>
 #include <type_traits>
+#include <stdexcept>
+
 
 #ifdef _MSC_VER
 #define SKA_NOINLINE(...) __declspec(noinline) __VA_ARGS__
@@ -1499,4 +1501,4 @@ namespace vkcom
         typedef vkcom::power_of_two_hash_policy hash_policy;
     };
 
-} // end namespace vkcom
\ No newline at end of file
+} // end namespace vkcom