diff --git a/FWCore/Services/test/test_threadPool_service.cppunit.cu b/FWCore/Services/test/test_threadPool_service.cppunit.cu index e0d18a1ab53fb..cc1676823ff30 100644 --- a/FWCore/Services/test/test_threadPool_service.cppunit.cu +++ b/FWCore/Services/test/test_threadPool_service.cppunit.cu @@ -8,9 +8,7 @@ #include #include #include - #include - #include #include @@ -25,6 +23,7 @@ #include "cppunit/extensions/HelperMacros.h" #include "Utilities/Testing/interface/CppUnit_testdriver.icpp" +#define PI 3.141592 using namespace std; using namespace edm; @@ -34,21 +33,27 @@ class TestThreadPoolService: public CppUnit::TestFixture { //CPPUNIT_TEST(passServiceArgTest); CPPUNIT_TEST(CUDATest); CPPUNIT_TEST(CUDAAutolaunchManagedTest); + CPPUNIT_TEST(CUDAAutolaunchCUDAPTRTest); + CPPUNIT_TEST(CUDAAutolaunch2Dconfig); CPPUNIT_TEST(timeBenchmark); CPPUNIT_TEST_SUITE_END(); public: void setUp(); void tearDown() { - //(*poolPtr)->clearTasks(); - cout<<"\n"; + (*poolPtr)->clearTasks(); + (*poolPtr)->stopWorkers(); + cout<<"\n\n"; } void basicUseTest(); //!< @brief Test behaviour if the task itself enqueues another task in same pool void passServiceArgTest(); - //!< @brief Test scheduling many threads that launch CUDA kernels + //!< @brief Test scheduling many threads that launch CUDA kernels (pool.getFuture) void CUDATest(); //!< @brief Test auto launch cuda kernel with its arguments in managed memory void CUDAAutolaunchManagedTest(); + void CUDAAutolaunchCUDAPTRTest(); + //!< @brief Use auto config to manually configure a 2D kernel launch + void CUDAAutolaunch2Dconfig(); void timeBenchmark(); private: void print_id(int id); @@ -62,8 +67,11 @@ private: const int BLOCK_SIZE= 32; ServiceToken serviceToken; - unique_ptr> poolPtr; + string serviceConfig= "import FWCore.ParameterSet.Config as cms\n" + "process = cms.Process('testThreadPoolService')\n" + "process.ThreadPoolService = cms.Service('ThreadPoolService')\n"; unique_ptr operate; + unique_ptr> poolPtr; }; ///registration of the test so that the runner can find it @@ -79,24 +87,36 @@ __global__ void longKernel(const int n, const int times, const float* in, float* } } } +__global__ void matAddKernel(int m, int n, const float* __restrict__ A, + const float* __restrict__ B, float* __restrict__ C) +{ + int x= blockIdx.x*blockDim.x + threadIdx.x; + int y= blockIdx.y*blockDim.y + threadIdx.y; + + // ### Difference between manual and automatic kernel grid: + if (x vec; - vec.push_back(pSet); - operate= unique_ptr( - new ServiceRegistry::Operate(edm::ServiceRegistry::createSet(vec))); - poolPtr= unique_ptr>( - new Service); - //(*poolPtr)->startWorkers(); - cout<<"[ThreadPoolServiceTest::init] Service initialized\n"; } + //Alternative way to setup Services + /*ParameterSet pSet; + pSet.addParameter("@service_type", string("ThreadPoolService")); + vector vec; + vec.push_back(pSet);*/ + serviceToken= edm::ServiceRegistry::createServicesFromConfig(serviceConfig); + operate= unique_ptr( + //new ServiceRegistry::Operate(edm::ServiceRegistry::createSet(vec))); + new ServiceRegistry::Operate(serviceToken)); + poolPtr.reset(new Service()); + (*poolPtr)->startWorkers(); } void TestThreadPoolService::print_id(int id) { unique_lock lck(mtx); @@ -125,7 +145,7 @@ void TestThreadPoolService::cudaTask(int n, int i, const float* din, int times){ void TestThreadPoolService::basicUseTest() { - cout<<"\nStarting basic test...\n"; + cout<<"Starting basic test...\n"; (*poolPtr)->getFuture([]() {cout<<"Empty task\n";}).get(); vector> futures; const int N= 30; @@ -143,7 +163,7 @@ void TestThreadPoolService::basicUseTest() } void TestThreadPoolService::passServiceArgTest() { - cout<<"\nStarting passServiceArg test...\n" + cout<<"Starting passServiceArg test...\n" <<"(requires >1 thread, otherwise will never finish)\n"; (*poolPtr)->getFuture([&]() { cout<<"Recursive enqueue #1\n"; @@ -158,7 +178,7 @@ void TestThreadPoolService::passServiceArgTest() } void TestThreadPoolService::CUDATest() { - cout<<"\nStarting CUDA test...\n"; + cout<<"Starting CUDA test...\n"; vector> futures; const int N= 30; @@ -177,11 +197,10 @@ void TestThreadPoolService::CUDATest() } for (auto& future: futures) future.get(); } - #define TOLERANCE 5e-1 void TestThreadPoolService::CUDAAutolaunchManagedTest() { - cout<<"\nStarting CUDA autolaunch (managed) test...\n"; + cout<<"Starting CUDA autolaunch (managed) test...\n"; float *in, *out; const int n= 10000000, times= 1000; cudaMallocManaged(&in, n*sizeof(float)); //cudaMemAttachHost? @@ -211,11 +230,91 @@ void TestThreadPoolService::CUDAAutolaunchManagedTest() cudaFree(in); cudaFree(out); } +void TestThreadPoolService::CUDAAutolaunchCUDAPTRTest() +{ + cout<<"Starting CUDA autolaunch (managed) test...\n"; + const int n= 10000000, times= 1000; + cudaPointer in(n); + cudaPointer out(n); + for(int i=0; iconfigureLaunch(n, longKernel)); + (*poolPtr)->cudaLaunchManaged(execPol, longKernel, (int)n,(int)times, + const_cast(in.p),out.p).get(); + for(int i=0; iTOLERANCE || times*in.p[i]-out.p[i]<-TOLERANCE){ + cout<<"ERROR: i="<cudaLaunchManaged(execPol, longKernel, (int)n,(int)times, + const_cast(in.p),out.p).get(); + for(int i=0; iTOLERANCE || times*in.p[i]-out.p[i]<-TOLERANCE){ + cout<<"ERROR: i="<> futVec(threadN); + //Recommended launch configuration (1D) + cudaConfig::ExecutionPolicy execPol((*poolPtr)->configureLaunch(n*m, matAddKernel)); + //Explicitly set desired launch config (2D) based on the previous recommendation + const unsigned blockSize= sqrt(execPol.getBlockSize().x); + execPol.setBlockSize({blockSize, blockSize}).autoGrid({n,m}); + //Semi-manually launch GPU tasks + for(int thread=0; threadcudaLaunchManaged(execPol, matAddKernel, m, n, + A[thread],B[thread],C[thread]); + } + cout<<"Launch config:\nBlock="<& elt) { + elt.get(); + }); + + for(int thread=0; thread TOLERANCE){ + /*cout << "ERROR! thread="<getFuture([] (){ - this_thread::sleep_for(chrono::microseconds(1)); + //for (register short k=0; k<1; k++) + // cout<<""; }); for_each(futVec.begin(), futVec.end(), [] (future& elt) { elt.get(); @@ -238,5 +339,5 @@ void TestThreadPoolService::timeBenchmark() diff += (i>0)? end-start: start-start; } - cout << "ThreadPoolService normal operation: "<< chrono::duration (diff).count()/N << " ns" << endl; + cout << "ThreadPoolService normal operation: "<< chrono::duration (diff).count()/(N/threadN) << " ns" << endl; }