Merge pull request #271 from sampath1117/sr/audio_merge_fixes

Merge fixes for audio augmentations
r-abishek · May 9, 2024 · 982f18e
2 parents 1a3015c + 5470941
commit 982f18e
Show file tree

Hide file tree

Showing 6 changed files with 18 additions and 31 deletions.
diff --git a/include/rppdefs.h b/include/rppdefs.h
@@ -428,15 +428,6 @@ typedef enum
     REFLECT
 } RpptAudioBorderType;
 
-/*! \brief RPPT Spectrogram Layout enum
- * \ingroup group_rppdefs
- */
-typedef enum
-{
-    FT = 0,  //Frequency Major
-    TF,      //Time Major
-} RpptSpectrogramLayout;
-
 /*! \brief RPPT Mel Scale Formula
  * \ingroup group_rppdefs
  */

diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h
@@ -115,7 +115,7 @@ RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_
  * \param [in] srcPtr source tensor in HOST memory
  * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
  * \param [out] dstPtr destination tensor in HOST memory
- * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT / NTF)
  * \param [in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize)
  * \param [in] centerWindows indicates whether extracted windows should be padded so that the window function is centered at multiples of window_step
  * \param [in] reflectPadding indicates the padding policy when sampling outside the bounds of the signal
@@ -124,20 +124,19 @@ RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_
  * \param [in] power exponent of the magnitude of the spectrum
  * \param [in] windowLength window size in number of samples
  * \param [in] windowStep step between the STFT windows in number of samples
- * \param [in] layout specifies output layout of spectrogram
  * \param [in] rppHandle RPP HOST handle created with <tt>\ref rppCreateWithBatchSize()</tt>
  * \return A <tt> \ref RppStatus</tt> enumeration.
  * \retval RPP_SUCCESS Successful completion.
  * \retval RPP_ERROR* Unsuccessful completion.
  */
-RppStatus rppt_spectrogram_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcLengthTensor, bool centerWindows, bool reflectPadding, Rpp32f *windowFunction, Rpp32s nfft, Rpp32s power, Rpp32s windowLength, Rpp32s windowStep, RpptSpectrogramLayout layout, rppHandle_t rppHandle);
+RppStatus rppt_spectrogram_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcLengthTensor, bool centerWindows, bool reflectPadding, Rpp32f *windowFunction, Rpp32s nfft, Rpp32s power, Rpp32s windowLength, Rpp32s windowStep, rppHandle_t rppHandle);
 
 /*! \brief Mel filter bank augmentation HOST backend
  * \details Mel filter bank augmentation for audio data
  * \param[in] srcPtr source tensor in HOST memory
- * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT / NTF)
+ * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT)
  * \param[out] dstPtr destination tensor in HOST memory
- * \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT / NTF)
+ * \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT)
  * \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2)
  * \param[in] maxFreq maximum frequency if not provided maxFreq = sampleRate / 2
  * \param[in] minFreq minimum frequency

diff --git a/src/modules/cpu/kernel/pre_emphasis_filter.hpp b/src/modules/cpu/kernel/pre_emphasis_filter.hpp
@@ -50,7 +50,7 @@ RppStatus pre_emphasis_filter_host_tensor(Rpp32f *srcPtr,
         dstPtrTemp[0] = srcPtrTemp[0] - coeff * border;
 
         Rpp32s vectorIncrement = 8;
-        Rpp32s alignedLength = (bufferLength / 8) * 8;
+        Rpp32s alignedLength = (bufferLength / 8) * 8 - 8;
         __m256 pCoeff = _mm256_set1_ps(coeff);
 
         Rpp32s vectorLoopCount = 1;

diff --git a/src/modules/cpu/kernel/spectrogram.hpp b/src/modules/cpu/kernel/spectrogram.hpp
@@ -80,11 +80,10 @@ RppStatus spectrogram_host_tensor(Rpp32f *srcPtr,
                                   Rpp32s power,
                                   Rpp32s windowLength,
                                   Rpp32s windowStep,
-                                  RpptSpectrogramLayout layout,
                                   rpp::Handle& handle)
 {
     Rpp32s windowCenterOffset = 0;
-    bool vertical = (layout == RpptSpectrogramLayout::FT);
+    bool vertical = (dstDescPtr->layout == RpptLayout::NFT);
     if (centerWindows) windowCenterOffset = windowLength / 2;
     if (nfft == 0) nfft = windowLength;
     const Rpp32s numBins = nfft / 2 + 1;

diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp
@@ -169,9 +169,10 @@ RppStatus rppt_spectrogram_host(RppPtr_t srcPtr,
                                 Rpp32s power,
                                 Rpp32s windowLength,
                                 Rpp32s windowStep,
-                                RpptSpectrogramLayout layout,
                                 rppHandle_t rppHandle)
 {
+    if ((dstDescPtr->layout != RpptLayout::NFT) && (dstDescPtr->layout != RpptLayout::NTF)) return RPP_ERROR_INVALID_DST_LAYOUT;
+
     if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
     {
         spectrogram_host_tensor(static_cast<Rpp32f*>(srcPtr),
@@ -186,7 +187,6 @@ RppStatus rppt_spectrogram_host(RppPtr_t srcPtr,
                                 power,
                                 windowLength,
                                 windowStep,
-                                layout,
                                 rpp::deref(rppHandle));
 
         return RPP_SUCCESS;

diff --git a/utilities/test_suite/HOST/Tensor_audio_host.cpp b/utilities/test_suite/HOST/Tensor_audio_host.cpp
@@ -138,9 +138,13 @@ int main(int argc, char **argv)
     RpptImagePatch *srcDims = (RpptImagePatch *) calloc(batchSize, sizeof(RpptImagePatch));
     RpptImagePatch *dstDims = (RpptImagePatch *) calloc(batchSize, sizeof(RpptImagePatch));
 
+    // buffers used for non silent region detection
+    Rpp32s detectedIndex[batchSize], detectionLength[batchSize];
+
     // run case-wise RPP API and measure time
     rppHandle_t handle;
     rppCreateWithBatchSize(&handle, srcDescPtr->n, 3);
+
     int noOfIterations = (int)audioNames.size() / batchSize;
     double maxWallTime = 0, minWallTime = 500, avgWallTime = 0;
     string testCaseName;
@@ -158,8 +162,6 @@ int main(int argc, char **argv)
                 case 0:
                 {
                     testCaseName = "non_silent_region_detection";
-                    Rpp32s detectedIndex[batchSize];
-                    Rpp32s detectionLength[batchSize];
                     Rpp32f cutOffDB = -60.0;
                     Rpp32s windowLength = 2048;
                     Rpp32f referencePower = 0.0f;
@@ -168,10 +170,6 @@ int main(int argc, char **argv)
                     startWallTime = omp_get_wtime();
                     rppt_non_silent_region_detection_host(inputf32, srcDescPtr, srcLengthTensor, detectedIndex, detectionLength, cutOffDB, windowLength, referencePower, resetInterval, handle);
 
-                    // QA mode - verify outputs with golden outputs. Below code doesn’t run for performance tests
-                    if (testType == 0)
-                        verify_non_silent_region_detection(detectedIndex, detectionLength, testCaseName, batchSize, audioNames, dst);
-
                     break;
                 }
                 case 1:
@@ -238,15 +236,15 @@ int main(int argc, char **argv)
                     Rpp32s windowLength = 320;
                     Rpp32s windowStep = 160;
                     Rpp32s nfft = 512;
-                    RpptSpectrogramLayout layout = RpptSpectrogramLayout::FT;
+                    dstDescPtr->layout = RpptLayout::NFT;
 
                     int windowOffset = 0;
                     if(!centerWindows)
                         windowOffset = windowLength;
 
                     maxDstWidth = 0;
                     maxDstHeight = 0;
-                    if(layout == RpptSpectrogramLayout::FT)
+                    if(dstDescPtr->layout == RpptLayout::NFT)
                     {
                         for(int i = 0; i < noOfAudioFiles; i++)
                         {
@@ -274,7 +272,7 @@ int main(int argc, char **argv)
                     outputf32 = (Rpp32f *)realloc(outputf32, spectrogramBufferSize * sizeof(Rpp32f));
 
                     startWallTime = omp_get_wtime();
-                    rppt_spectrogram_host(inputf32, srcDescPtr, outputf32, dstDescPtr, srcLengthTensor, centerWindows, reflectPadding, windowFn, nfft, power, windowLength, windowStep, layout, handle);
+                    rppt_spectrogram_host(inputf32, srcDescPtr, outputf32, dstDescPtr, srcLengthTensor, centerWindows, reflectPadding, windowFn, nfft, power, windowLength, windowStep, handle);
 
                     break;
                 }
@@ -421,9 +419,9 @@ int main(int argc, char **argv)
         // QA mode - verify outputs with golden outputs. Below code doesn’t run for performance tests
         if (testType == 0)
         {
-            /* Run only if testCase is not 0
-            For testCase 0 verify_non_silent_region_detection function is used for QA testing */
-            if (testCase != 0)
+            if (testCase == 0)
+                verify_non_silent_region_detection(detectedIndex, detectionLength, testCaseName, batchSize, audioNames, dst);
+            else
                 verify_output(outputf32, dstDescPtr, dstDims, testCaseName, dst, scriptPath);
 
             /* Dump the outputs to csv files for debugging