From bd7ae44144d7c4768fb9143831902d87388e9dec Mon Sep 17 00:00:00 2001 From: Nuzhny007 Date: Tue, 10 Dec 2024 22:52:23 +0300 Subject: [PATCH 1/2] Correct aspect ratio after frame to input resizing --- src/Detector/tensorrt_yolo/YoloONNX.cpp | 7 ++++--- src/Detector/tensorrt_yolo/YoloONNX.hpp | 7 ++++--- src/Detector/tensorrt_yolo/YoloONNXv10_bb.hpp | 8 +++---- src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp | 8 +++---- .../tensorrt_yolo/YoloONNXv11_instance.hpp | 8 +++---- .../tensorrt_yolo/YoloONNXv11_obb.hpp | 8 +++---- src/Detector/tensorrt_yolo/YoloONNXv6_bb.hpp | 21 ++++++++----------- src/Detector/tensorrt_yolo/YoloONNXv7_bb.hpp | 21 ++++++++----------- .../tensorrt_yolo/YoloONNXv7_instance.hpp | 8 +++---- src/Detector/tensorrt_yolo/YoloONNXv8_bb.hpp | 8 +++---- .../tensorrt_yolo/YoloONNXv8_instance.hpp | 8 +++---- src/Detector/tensorrt_yolo/YoloONNXv8_obb.hpp | 8 +++---- src/Detector/tensorrt_yolo/YoloONNXv9_bb.hpp | 8 +++---- 13 files changed, 62 insertions(+), 66 deletions(-) diff --git a/src/Detector/tensorrt_yolo/YoloONNX.cpp b/src/Detector/tensorrt_yolo/YoloONNX.cpp index 93ca4435..690be11c 100644 --- a/src/Detector/tensorrt_yolo/YoloONNX.cpp +++ b/src/Detector/tensorrt_yolo/YoloONNX.cpp @@ -326,8 +326,9 @@ bool YoloONNX::ProcessInputAspectRatio(const std::vector& sampleImages) } } -#if 0 + m_resizedROI = cv::Rect(0, 0, inputW, inputH); +#if 1 // resize the DsImage with scale const float imgHeight = static_cast(sampleImages[0].rows); const float imgWidth = static_cast(sampleImages[0].cols); @@ -351,7 +352,7 @@ bool YoloONNX::ProcessInputAspectRatio(const std::vector& sampleImages) assert(2 * yOffset + resizeH == inputH); cv::Size scaleSize(inputW, inputH); - cv::Rect roiRect(xOffset, yOffset, resizeW, resizeH); + m_resizedROI = cv::Rect(xOffset, yOffset, resizeW, resizeH); if (m_resizedBatch.size() < sampleImages.size()) m_resizedBatch.resize(sampleImages.size()); @@ -361,7 +362,7 @@ bool YoloONNX::ProcessInputAspectRatio(const std::vector& sampleImages) { if (m_resizedBatch[b].size() != scaleSize) m_resizedBatch[b] = cv::Mat(scaleSize, sampleImages[b].type(), cv::Scalar::all(128)); - cv::resize(sampleImages[b], cv::Mat(m_resizedBatch[b], roiRect), roiRect.size(), 0, 0, cv::INTER_LINEAR); + cv::resize(sampleImages[b], cv::Mat(m_resizedBatch[b], m_resizedROI), m_resizedROI.size(), 0, 0, cv::INTER_LINEAR); cv::split(m_resizedBatch[b], m_inputChannels[b]); std::swap(m_inputChannels[b][0], m_inputChannels[b][2]); } diff --git a/src/Detector/tensorrt_yolo/YoloONNX.hpp b/src/Detector/tensorrt_yolo/YoloONNX.hpp index 8152d166..ffd2f0bf 100644 --- a/src/Detector/tensorrt_yolo/YoloONNX.hpp +++ b/src/Detector/tensorrt_yolo/YoloONNX.hpp @@ -79,9 +79,10 @@ class YoloONNX size_t GetNumClasses() const; protected: - SampleYoloParams m_params; //!< The parameters for the sample. - nvinfer1::Dims m_inputDims; //!< The dimensions of the input to the network. - std::vector m_outpuDims; //!< The dimensions of the input to the network. + SampleYoloParams m_params; //!< The parameters for the sample + nvinfer1::Dims m_inputDims; //!< The dimensions of the input to the network + std::vector m_outpuDims; //!< The dimensions of the input to the network + cv::Rect m_resizedROI; //!< Input frame resized into input dimensions with the frame aspect ratio virtual std::vector GetResult(size_t imgIdx, int keep_topk, const std::vector& outputs, cv::Size frameSize) = 0; diff --git a/src/Detector/tensorrt_yolo/YoloONNXv10_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv10_bb.hpp index 137dbf8e..648e420f 100644 --- a/src/Detector/tensorrt_yolo/YoloONNXv10_bb.hpp +++ b/src/Detector/tensorrt_yolo/YoloONNXv10_bb.hpp @@ -20,8 +20,8 @@ class YOLOv10_bb_onnx : public YoloONNX //0: name: images, size: 1x3x640x640 //1: name: output0, size: 1x300x6 - const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); - const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + const float fw = static_cast(frameSize.width) / static_cast(m_resizedROI.width); + const float fh = static_cast(frameSize.height) / static_cast(m_resizedROI.height); auto output = outputs[0]; @@ -51,8 +51,8 @@ class YOLOv10_bb_onnx : public YoloONNX //if (i == 0) // std::cout << i << ": " << output[k + 0] << " " << output[k + 1] << " " << output[k + 2] << " " << output[k + 3] << " " << output[k + 4] << " " << output[k + 5] << std::endl; - float x = fw * output[k + 0]; - float y = fh * output[k + 1]; + float x = fw * (output[k + 0] - m_resizedROI.x); + float y = fh * (output[k + 1] - m_resizedROI.y); float width = fw * (output[k + 2] - output[k + 0]); float height = fh * (output[k + 3] - output[k + 1]); float objectConf = output[k + 4]; diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp index 9103bfa6..654f9ea4 100644 --- a/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp +++ b/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp @@ -20,8 +20,8 @@ class YOLOv11_bb_onnx : public YoloONNX //0: name: images, size: 1x3x640x640 //1: name: output0, size: 1x84x8400 - const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); - const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + const float fw = static_cast(frameSize.width) / static_cast(m_resizedROI.width); + const float fh = static_cast(frameSize.height) / static_cast(m_resizedROI.height); auto output = outputs[0]; @@ -88,8 +88,8 @@ class YOLOv11_bb_onnx : public YoloONNX confidences.push_back(objectConf); // (center x, center y, width, height) to (x, y, w, h) - float x = fw * (output[k] - output[k + 2] / 2); - float y = fh * (output[k + 1] - output[k + 3] / 2); + float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x); + float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y); float width = fw * output[k + 2]; float height = fh * output[k + 3]; rectBoxes.emplace_back(cvRound(x), cvRound(y), cvRound(width), cvRound(height)); diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp index 54fc6b01..30261daf 100644 --- a/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp +++ b/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp @@ -17,8 +17,8 @@ class YOLOv11_instance_onnx : public YoloONNX { std::vector resBoxes; - const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); - const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + const float fw = static_cast(frameSize.width) / static_cast(m_resizedROI.width); + const float fh = static_cast(frameSize.height) / static_cast(m_resizedROI.height); size_t outInd = (outputs.size() == 0) ? 1 : 0; size_t segInd = (outputs.size() == 0) ? 0 : 1; @@ -155,8 +155,8 @@ class YOLOv11_instance_onnx : public YoloONNX if (objectConf >= m_params.confThreshold) { // (center x, center y, width, height) to (x, y, w, h) - float x = fw * (output[k] - output[k + 2] / 2); - float y = fh * (output[k + 1] - output[k + 3] / 2); + float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x); + float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y); float width = fw * output[k + 2]; float height = fh * output[k + 3]; diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp index 7c2b98ce..c35b16c9 100644 --- a/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp +++ b/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp @@ -22,8 +22,8 @@ class YOLOv11_obb_onnx : public YoloONNX //20: 15 DOTA classes + x + y + w + h + a constexpr int shapeDataSize = 5; - const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); - const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + const float fw = static_cast(frameSize.width) / static_cast(m_resizedROI.width); + const float fh = static_cast(frameSize.height) / static_cast(m_resizedROI.height); auto output = outputs[0]; @@ -96,8 +96,8 @@ class YOLOv11_obb_onnx : public YoloONNX confidences.push_back(objectConf); // (center x, center y, width, height) - float cx = fw * output[k]; - float cy = fh * output[k + 1]; + float cx = fw * (output[k] - m_resizedROI.x); + float cy = fh * (output[k + 1] - m_resizedROI.y); float width = fw * output[k + 2]; float height = fh * output[k + 3]; float angle = 180.f * output[k + nc + shapeDataSize - 1] / M_PI; diff --git a/src/Detector/tensorrt_yolo/YoloONNXv6_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv6_bb.hpp index 4d6a6268..ef82ca23 100644 --- a/src/Detector/tensorrt_yolo/YoloONNXv6_bb.hpp +++ b/src/Detector/tensorrt_yolo/YoloONNXv6_bb.hpp @@ -17,6 +17,9 @@ class YOLOv6_bb_onnx : public YoloONNX { std::vector resBoxes; + const float fw = static_cast(frameSize.width) / static_cast(m_resizedROI.width); + const float fh = static_cast(frameSize.height) / static_cast(m_resizedROI.height); + if (outputs.size() == 4) { auto dets = reinterpret_cast(outputs[0]); @@ -26,9 +29,6 @@ class YOLOv6_bb_onnx : public YoloONNX int objectsCount = m_outpuDims[1].d[1]; - const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); - const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); - //std::cout << "Dets[" << imgIdx << "] = " << dets[imgIdx] << ", objectsCount = " << objectsCount << std::endl; const size_t step1 = imgIdx * objectsCount; @@ -41,8 +41,8 @@ class YOLOv6_bb_onnx : public YoloONNX int classId = classes[i + step1]; if (class_conf >= m_params.confThreshold) { - float x = fw * boxes[k + 0 + step2]; - float y = fh * boxes[k + 1 + step2]; + float x = fw * (boxes[k + 0 + step2] - m_resizedROI.x); + float y = fh * (boxes[k + 1 + step2] - m_resizedROI.y); float width = fw * boxes[k + 2 + step2] - x; float height = fh * boxes[k + 3 + step2] - y; @@ -57,9 +57,6 @@ class YOLOv6_bb_onnx : public YoloONNX } else if (outputs.size() == 1) { - const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); - const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); - auto output = outputs[0]; size_t ncInd = 2; @@ -96,8 +93,8 @@ class YOLOv6_bb_onnx : public YoloONNX int classId = cvRound(output[k + 5]); if (class_conf >= m_params.confThreshold) { - float x = fw * output[k + 1]; - float y = fh * output[k + 2]; + float x = fw * (output[k + 1] - m_resizedROI.x); + float y = fh * (output[k + 2] - m_resizedROI.y); float width = fw * (output[k + 3] - output[k + 1]); float height = fh * (output[k + 4] - output[k + 2]); @@ -150,8 +147,8 @@ class YOLOv6_bb_onnx : public YoloONNX if (object_conf >= m_params.confThreshold) { // (center x, center y, width, height) to (x, y, w, h) - float x = fw * (output[k] - output[k + 2] / 2); - float y = fh * (output[k + 1] - output[k + 3] / 2); + float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x); + float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y); float width = fw * output[k + 2]; float height = fh * output[k + 3]; diff --git a/src/Detector/tensorrt_yolo/YoloONNXv7_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv7_bb.hpp index 78a383b4..946daf4c 100644 --- a/src/Detector/tensorrt_yolo/YoloONNXv7_bb.hpp +++ b/src/Detector/tensorrt_yolo/YoloONNXv7_bb.hpp @@ -17,6 +17,9 @@ class YOLOv7_bb_onnx : public YoloONNX { std::vector resBoxes; + const float fw = static_cast(frameSize.width) / static_cast(m_resizedROI.width); + const float fh = static_cast(frameSize.height) / static_cast(m_resizedROI.height); + if (outputs.size() == 4) { auto dets = reinterpret_cast(outputs[0]); @@ -26,9 +29,6 @@ class YOLOv7_bb_onnx : public YoloONNX int objectsCount = m_outpuDims[1].d[1]; - const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); - const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); - //std::cout << "Dets[" << imgIdx << "] = " << dets[imgIdx] << ", objectsCount = " << objectsCount << std::endl; const size_t step1 = imgIdx * objectsCount; @@ -41,8 +41,8 @@ class YOLOv7_bb_onnx : public YoloONNX int classId = classes[i + step1]; if (class_conf >= m_params.confThreshold) { - float x = fw * boxes[k + 0 + step2]; - float y = fh * boxes[k + 1 + step2]; + float x = fw * (boxes[k + 0 + step2] - m_resizedROI.x); + float y = fh * (boxes[k + 1 + step2] - m_resizedROI.y); float width = fw * boxes[k + 2 + step2] - x; float height = fh * boxes[k + 3 + step2] - y; @@ -57,9 +57,6 @@ class YOLOv7_bb_onnx : public YoloONNX } else if (outputs.size() == 1) { - const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); - const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); - auto output = outputs[0]; size_t ncInd = 2; @@ -96,8 +93,8 @@ class YOLOv7_bb_onnx : public YoloONNX int classId = cvRound(output[k + 5]); if (class_conf >= m_params.confThreshold) { - float x = fw * output[k + 1]; - float y = fh * output[k + 2]; + float x = fw * (output[k + 1] - m_resizedROI.x); + float y = fh * (output[k + 2] - m_resizedROI.y); float width = fw * (output[k + 3] - output[k + 1]); float height = fh * (output[k + 4] - output[k + 2]); @@ -150,8 +147,8 @@ class YOLOv7_bb_onnx : public YoloONNX if (object_conf >= m_params.confThreshold) { // (center x, center y, width, height) to (x, y, w, h) - float x = fw * (output[k] - output[k + 2] / 2); - float y = fh * (output[k + 1] - output[k + 3] / 2); + float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x); + float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y); float width = fw * output[k + 2]; float height = fh * output[k + 3]; diff --git a/src/Detector/tensorrt_yolo/YoloONNXv7_instance.hpp b/src/Detector/tensorrt_yolo/YoloONNXv7_instance.hpp index 73a5d671..1f962392 100644 --- a/src/Detector/tensorrt_yolo/YoloONNXv7_instance.hpp +++ b/src/Detector/tensorrt_yolo/YoloONNXv7_instance.hpp @@ -18,8 +18,8 @@ class YOLOv7_instance_onnx : public YoloONNX { std::vector resBoxes; - const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); - const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + const float fw = static_cast(frameSize.width) / static_cast(m_resizedROI.width); + const float fh = static_cast(frameSize.height) / static_cast(m_resizedROI.height); size_t outInd = (outputs.size() == 0) ? 0 : 1; size_t segInd = (outputs.size() == 0) ? 1 : 0; @@ -123,8 +123,8 @@ class YOLOv7_instance_onnx : public YoloONNX if (object_conf >= m_params.confThreshold) { // (center x, center y, width, height) to (x, y, w, h) - float x = fw * (output[k] - output[k + 2] / 2); - float y = fh * (output[k + 1] - output[k + 3] / 2); + float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x); + float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y); float width = fw * output[k + 2]; float height = fh * output[k + 3]; diff --git a/src/Detector/tensorrt_yolo/YoloONNXv8_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv8_bb.hpp index 9a21e397..4e24d2f5 100644 --- a/src/Detector/tensorrt_yolo/YoloONNXv8_bb.hpp +++ b/src/Detector/tensorrt_yolo/YoloONNXv8_bb.hpp @@ -20,8 +20,8 @@ class YOLOv8_bb_onnx : public YoloONNX //0: name: images, size: 1x3x640x640 //1: name: output0, size: 1x84x8400 - const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); - const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + const float fw = static_cast(frameSize.width) / static_cast(m_resizedROI.width); + const float fh = static_cast(frameSize.height) / static_cast(m_resizedROI.height); auto output = outputs[0]; @@ -88,8 +88,8 @@ class YOLOv8_bb_onnx : public YoloONNX confidences.push_back(objectConf); // (center x, center y, width, height) to (x, y, w, h) - float x = fw * (output[k] - output[k + 2] / 2); - float y = fh * (output[k + 1] - output[k + 3] / 2); + float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x); + float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y); float width = fw * output[k + 2]; float height = fh * output[k + 3]; rectBoxes.emplace_back(cvRound(x), cvRound(y), cvRound(width), cvRound(height)); diff --git a/src/Detector/tensorrt_yolo/YoloONNXv8_instance.hpp b/src/Detector/tensorrt_yolo/YoloONNXv8_instance.hpp index dbdf20fd..dff444b8 100644 --- a/src/Detector/tensorrt_yolo/YoloONNXv8_instance.hpp +++ b/src/Detector/tensorrt_yolo/YoloONNXv8_instance.hpp @@ -17,8 +17,8 @@ class YOLOv8_instance_onnx : public YoloONNX { std::vector resBoxes; - const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); - const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + const float fw = static_cast(frameSize.width) / static_cast(m_resizedROI.width); + const float fh = static_cast(frameSize.height) / static_cast(m_resizedROI.height); size_t outInd = (outputs.size() == 0) ? 0 : 1; size_t segInd = (outputs.size() == 0) ? 1 : 0; @@ -155,8 +155,8 @@ class YOLOv8_instance_onnx : public YoloONNX if (objectConf >= m_params.confThreshold) { // (center x, center y, width, height) to (x, y, w, h) - float x = fw * (output[k] - output[k + 2] / 2); - float y = fh * (output[k + 1] - output[k + 3] / 2); + float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x); + float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y); float width = fw * output[k + 2]; float height = fh * output[k + 3]; diff --git a/src/Detector/tensorrt_yolo/YoloONNXv8_obb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv8_obb.hpp index 4c39c5a4..e9a232dd 100644 --- a/src/Detector/tensorrt_yolo/YoloONNXv8_obb.hpp +++ b/src/Detector/tensorrt_yolo/YoloONNXv8_obb.hpp @@ -22,8 +22,8 @@ class YOLOv8_obb_onnx : public YoloONNX //20: 15 DOTA classes + x + y + w + h + a constexpr int shapeDataSize = 5; - const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); - const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + const float fw = static_cast(frameSize.width) / static_cast(m_resizedROI.width); + const float fh = static_cast(frameSize.height) / static_cast(m_resizedROI.height); auto output = outputs[0]; @@ -96,8 +96,8 @@ class YOLOv8_obb_onnx : public YoloONNX confidences.push_back(objectConf); // (center x, center y, width, height) - float cx = fw * output[k]; - float cy = fh * output[k + 1]; + float cx = fw * (output[k] - m_resizedROI.x); + float cy = fh * (output[k + 1] - m_resizedROI.y); float width = fw * output[k + 2]; float height = fh * output[k + 3]; float angle = 180.f * output[k + nc + shapeDataSize - 1] / M_PI; diff --git a/src/Detector/tensorrt_yolo/YoloONNXv9_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv9_bb.hpp index f4c99ebd..a934877b 100644 --- a/src/Detector/tensorrt_yolo/YoloONNXv9_bb.hpp +++ b/src/Detector/tensorrt_yolo/YoloONNXv9_bb.hpp @@ -22,8 +22,8 @@ class YOLOv9_bb_onnx : public YoloONNX //84: 80 COCO classes + x + y + w + h constexpr int shapeDataSize = 4; - const float fw = static_cast(frameSize.width) / static_cast(m_inputDims.d[3]); - const float fh = static_cast(frameSize.height) / static_cast(m_inputDims.d[2]); + const float fw = static_cast(frameSize.width) / static_cast(m_resizedROI.width); + const float fh = static_cast(frameSize.height) / static_cast(m_resizedROI.height); auto output = outputs[0]; @@ -90,8 +90,8 @@ class YOLOv9_bb_onnx : public YoloONNX confidences.push_back(objectConf); // (center x, center y, width, height) to (x, y, w, h) - float x = fw * (output[k] - output[k + 2] / 2); - float y = fh * (output[k + 1] - output[k + 3] / 2); + float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x); + float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y); float width = fw * output[k + 2]; float height = fh * output[k + 3]; rectBoxes.emplace_back(cvRound(x), cvRound(y), cvRound(width), cvRound(height)); From 1a2b8096438f0e37b7f747a94eecf70857231dcc Mon Sep 17 00:00:00 2001 From: Nuzhny007 Date: Tue, 10 Dec 2024 23:46:43 +0300 Subject: [PATCH 2/2] Update RuCLIP library --- thirdparty/ruclip/RuCLIPProcessor.cpp | 24 +++++++ thirdparty/ruclip/RuCLIPProcessor.h | 70 ++++++++----------- .../youtokentome/third_party/flat_hash_map.h | 4 +- 3 files changed, 58 insertions(+), 40 deletions(-) diff --git a/thirdparty/ruclip/RuCLIPProcessor.cpp b/thirdparty/ruclip/RuCLIPProcessor.cpp index 8f617ee2..242ef31c 100644 --- a/thirdparty/ruclip/RuCLIPProcessor.cpp +++ b/thirdparty/ruclip/RuCLIPProcessor.cpp @@ -1,5 +1,29 @@ #include "RuCLIPProcessor.h" +/// +torch::Tensor CVMatToTorchTensor(const cv::Mat img, const bool perm = true) +{ + auto tensor_image = torch::from_blob(img.data, { img.rows, img.cols, img.channels() }, at::kByte); + if (perm) + tensor_image = tensor_image.permute({ 2,0,1 }); + tensor_image.unsqueeze_(0); + tensor_image = tensor_image.toType(c10::kFloat).div(255); + return tensor_image; //tensor_image.clone(); +} + +/// +cv::Mat TorchTensorToCVMat(const torch::Tensor tensor_image, const bool perm = true) +{ + auto t = tensor_image.detach().squeeze().cpu(); + if (perm) + t = t.permute({ 1, 2, 0 }); + t = t.mul(255).clamp(0, 255).to(torch::kU8); + cv::Mat result_img; + cv::Mat(static_cast(t.size(0)), static_cast(t.size(1)), CV_MAKETYPE(CV_8U, t.sizes().size() >= 3 ? static_cast(t.size(2)) : 1), t.data_ptr()).copyTo(result_img); + return result_img; +} + +/// RuCLIPProcessor :: RuCLIPProcessor( const std::string& tokenizer_path, const int image_size /*= 224*/, diff --git a/thirdparty/ruclip/RuCLIPProcessor.h b/thirdparty/ruclip/RuCLIPProcessor.h index be0a1cfa..6c6ac18e 100644 --- a/thirdparty/ruclip/RuCLIPProcessor.h +++ b/thirdparty/ruclip/RuCLIPProcessor.h @@ -8,45 +8,6 @@ #include #include -inline torch::Tensor CVMatToTorchTensor(const cv::Mat img, const bool perm = true) -{ - auto tensor_image = torch::from_blob(img.data, { img.rows, img.cols, img.channels() }, at::kByte); - if (perm) - tensor_image = tensor_image.permute({ 2,0,1 }); - tensor_image.unsqueeze_(0); - tensor_image = tensor_image.toType(c10::kFloat).div(255); - return tensor_image; //tensor_image.clone(); -} - -inline cv::Mat TorchTensorToCVMat(const torch::Tensor tensor_image, const bool perm = true) -{ - auto t = tensor_image.detach().squeeze().cpu(); - if (perm) - t = t.permute({ 1, 2, 0 }); - t = t.mul(255).clamp(0, 255).to(torch::kU8); - cv::Mat result_img; - cv::Mat(static_cast(t.size(0)), static_cast(t.size(1)), CV_MAKETYPE(CV_8U, t.sizes().size() >= 3 ? static_cast(t.size(2)) : 1), t.data_ptr()).copyTo(result_img); - return result_img; -} - -//template -//std::basic_string lowercase(const std::basic_string& s) -//{ -// std::basic_string s2 = s; -// std::transform(s2.begin(), s2.end(), s2.begin(), -// [](const T v) { return static_cast(std::tolower(v)); }); -// return s2; -//} -// -//template -//std::basic_string uppercase(const std::basic_string& s) -//{ -// std::basic_string s2 = s; -// std::transform(s2.begin(), s2.end(), s2.begin(), -// [](const T v) { return static_cast(std::toupper(v)); }); -// return s2; -//} - /// class RuCLIPProcessor { @@ -95,3 +56,34 @@ class RuCLIPProcessor std::vector m_textsTensors; }; + +//relevancy for batch size == 1 at this moment, float lv = result.index({0,0}).item(); +/// +///std::vector canon_texts_tensors; +///canon_texts_tensors.push_back(ClipProcessor->EncodeText(std::string("объект"))); +///canon_texts_tensors.push_back(ClipProcessor->EncodeText(std::string("вещи"))); +///canon_texts_tensors.push_back(ClipProcessor->EncodeText(std::string("текстура"))); +///int negatives_len = (int)canon_texts_tensors.size(); +///auto canon_features = Clip->EncodeText(torch::stack(canon_texts_tensors).to(Device)).to(torch::kCPU); ///[3, 768] +///canon_features = canon_features / canon_features.norm(2/*L2*/, -1, true); +///auto input = ClipProcessor->EncodeText(std::string("малый барабан")); +///auto text_features = Clip->EncodeText(input.unsqueeze(0).to(Device)).to(torch::kCPU); ///[1, 768] +///text_features = text_features / text_features.norm(2/*L2*/, -1, true); +///torch::Tensor image_features = PyramidClipEmbedding.GetPixelValue(i,j,0.5f,img_id,pyramid_embedder_properties,cv::Size(data.W, data.H)).to(torch::kCPU); +///image_features = image_features / image_features.norm(2/*L2*/, -1, true); +///torch::Tensor rel = Relevancy(image_features, text_features, canon_features); +///float lv = rel.index({0,0}).item(); +inline torch::Tensor Relevancy(torch::Tensor embeds, torch::Tensor positives, torch::Tensor negatives) +{ + auto embeds2 = torch::cat({ positives, negatives }); + auto logits = /*scale * */torch::mm(embeds, embeds2.t()); //[batch_size x phrases] + auto positive_vals = logits.index({ "...", torch::indexing::Slice(0, 1) }); // [batch_size x 1] + auto negative_vals = logits.index({ "...", torch::indexing::Slice(1, torch::indexing::None) }); // [batch_size x negative_phrase_n] + auto repeated_pos = positive_vals.repeat({ 1, negatives.sizes()[0] }); //[batch_size x negative_phrase_n] + auto sims = torch::stack({ repeated_pos, negative_vals }, -1); //[batch_size x negative_phrase_n x 2] + auto smx = torch::softmax(10 * sims, -1); // [batch_size x negative_phrase_n x 2] + auto best_id = smx.index({ "...", 0 }).argmin(1); // [batch_size x 2] + auto result = torch::gather(smx, 1, best_id.index({ "...", torch::indexing::None, torch::indexing::None }).expand({ best_id.sizes()[0], negatives.sizes()[0], 2 }) + ).index({ torch::indexing::Slice(), 0, torch::indexing::Slice() });// [batch_size x 2] + return result; +} diff --git a/thirdparty/ruclip/youtokentome/third_party/flat_hash_map.h b/thirdparty/ruclip/youtokentome/third_party/flat_hash_map.h index 4c4aa287..b900359e 100644 --- a/thirdparty/ruclip/youtokentome/third_party/flat_hash_map.h +++ b/thirdparty/ruclip/youtokentome/third_party/flat_hash_map.h @@ -12,6 +12,8 @@ #include #include #include +#include + #ifdef _MSC_VER #define SKA_NOINLINE(...) __declspec(noinline) __VA_ARGS__ @@ -1499,4 +1501,4 @@ namespace vkcom typedef vkcom::power_of_two_hash_policy hash_policy; }; -} // end namespace vkcom \ No newline at end of file +} // end namespace vkcom