diff --git a/CMakeLists.txt b/CMakeLists.txt index bbb913e..dd8755e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,13 +87,7 @@ ENDFOREACH(subdir) LIST(APPEND CMAKE_PREFIX_PATH $ENV{ROCM_PATH} /opt/rocm) FIND_PACKAGE(HIP CONFIG REQUIRED) -FIND_PACKAGE(HIPFFT CONFIG REQUIRED) - -IF(${HIP_COMPILER} STREQUAL "clang") - MESSAGE(STATUS "Using HIP-Clang compiler") -ELSE() - MESSAGE(FATAL_ERROR "HIP compiler ${HIP_COMPILER} not recognized!") -ENDIF() +FIND_PACKAGE(HIPFFT CONFIG) ADD_SUBDIRECTORY(platforms/hip) diff --git a/README.md b/README.md index 3bf46df..f148c6c 100644 --- a/README.md +++ b/README.md @@ -5,12 +5,6 @@ AMD GPUs on [AMD ROCmâ„¢ open software platform](https://rocmdocs.amd.com). ## Installing with Conda -This plugin requires hipFFT and rocFFT, install them from ROCm repositories: - -```sh -apt install hipfft rocfft -``` - ```sh conda create -n openmm-env -c streamhpc -c conda-forge --strict-channel-priority openmm-hip conda activate openmm-env @@ -69,7 +63,7 @@ The plugin requires source code of OpenMM, it can be downloaded as an archive [here](https://github.com/openmm/openmm/releases) or as a Git repository: ```sh -git clone https://github.com/openmm/openmm.git -b 8.0.0 +git clone https://github.com/openmm/openmm.git -b 8.1.1 ``` To build the plugin, follow these steps: @@ -100,7 +94,7 @@ source code: ```sh mkdir build build-hip install -git clone https://github.com/openmm/openmm.git -b 8.0.0 +git clone https://github.com/openmm/openmm.git -b 8.1.1 cd build cmake ../openmm/ -D CMAKE_INSTALL_PREFIX=../install -D OPENMM_BUILD_COMMON=ON -D OPENMM_PYTHON_USER_INSTALL=ON make @@ -138,15 +132,25 @@ please try different backends: * the hipFFT/rocFFT-based implementation (`export OPENMM_FFT_BACKEND=1`); * the VkFFT-based implementation (`export OPENMM_FFT_BACKEND=2`); -### The kernel compilation: hipcc and hipRTC +The hipFFT/rocFFT-based implementation requires hipFFT and rocFFT libraries, otherwise it will be +disabled, install them from ROCm repositories before running cmake: + +```sh +apt install hipfft rocfft +``` -By default, the HIP Platform builds kernels with the hipcc compiler. To run the compiler, paths -in the following order are used: +If you see "libhipfft.so.0: cannot open shared object file: No such file or directory", run +`ldconfig`. + +### The kernel compilation: amdclang++ and hipRTC + +By default, the HIP Platform builds kernels with the amdclang++ compiler. To run the compiler, +paths in the following order are used: * `properties['HipCompiler']`, if it is passed to Context constructor; * `OPENMM_HIP_COMPILER` environment variable, if it is set; -* `${ROCM_PATH}/bin/hipcc`, if `ROCM_PATH` environment variable is set; -* `/opt/rocm/bin/hipcc` otherwise. +* `${ROCM_PATH}/bin/amdclang++`, if `ROCM_PATH` environment variable is set; +* `/opt/rocm/bin/amdclang++` otherwise. There is an alternative way to compile kernels: hipRTC, it is implemented by `plugins/hipcompiler`. To enable this way: @@ -154,6 +158,10 @@ There is an alternative way to compile kernels: hipRTC, it is implemented by * set `properties['HipAllowRuntimeCompiler'] = 'true'`; * set `OPENMM_USE_HIPRTC` environment variable to 1 (`export OPENMM_USE_HIPRTC=1`). +**Warning:** hipRTC from ROCm 6.0.0 has issues with ambiguous operators for vector and complex +types. It seems that they have been fixed in ROCm/clr's `develop` branch and likely OpenMM+hipRTC +will be usable with the next ROCm release. + ## License The HIP Platform uses OpenMM API under the terms of the MIT License. A copy of this license may diff --git a/platforms/hip/CMakeLists.txt b/platforms/hip/CMakeLists.txt index 866336a..8380625 100644 --- a/platforms/hip/CMakeLists.txt +++ b/platforms/hip/CMakeLists.txt @@ -110,13 +110,17 @@ IF (OPENMM_BUILD_SHARED_LIB) ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES}) ADD_DEPENDENCIES(${SHARED_TARGET} CommonKernels HipKernels) - TARGET_LINK_LIBRARIES(${SHARED_TARGET} PUBLIC ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB} hip::host hip::hipfft) + TARGET_LINK_LIBRARIES(${SHARED_TARGET} PUBLIC ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB} hip::host) SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_COMMON_BUILDING_SHARED_LIBRARY") IF (APPLE) SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework HIP") ELSE (APPLE) SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}") ENDIF (APPLE) + IF(HIPFFT_FOUND) + TARGET_LINK_LIBRARIES(${SHARED_TARGET} PUBLIC hip::hipfft) + TARGET_COMPILE_OPTIONS(${SHARED_TARGET} PUBLIC "-DOPENMM_HIP_WITH_HIPFFT") + ENDIF(HIPFFT_FOUND) INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET}) ENDIF (OPENMM_BUILD_SHARED_LIB) @@ -127,13 +131,17 @@ IF(OPENMM_BUILD_STATIC_LIB) ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES}) ADD_DEPENDENCIES(${STATIC_TARGET} CommonKernels HipKernels) - TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB_STATIC} hip::host hip::hipfft) + TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB_STATIC} hip::host) SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_COMMON_BUILDING_STATIC_LIBRARY") IF (APPLE) SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework HIP") ELSE (APPLE) SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}") ENDIF (APPLE) + IF(HIPFFT_FOUND) + TARGET_LINK_LIBRARIES(${STATIC_TARGET} PUBLIC hip::hipfft) + TARGET_COMPILE_OPTIONS(${STATIC_TARGET} PUBLIC "-DOPENMM_HIP_WITH_HIPFFT") + ENDIF(HIPFFT_FOUND) INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET}) ENDIF(OPENMM_BUILD_STATIC_LIB) diff --git a/platforms/hip/include/HipKernels.h b/platforms/hip/include/HipKernels.h index 0c317b9..c1adf84 100644 --- a/platforms/hip/include/HipKernels.h +++ b/platforms/hip/include/HipKernels.h @@ -360,6 +360,18 @@ class HipCalcCustomCVForceKernel : public CommonCalcCustomCVForceKernel { } }; +/** + * This kernel is invoked by ATMForce to calculate the forces acting on the system and the energy of the system. + */ +class HipCalcATMForceKernel : public CommonCalcATMForceKernel { +public: + HipCalcATMForceKernel(std::string name, const Platform& platform, ComputeContext& cc) : CommonCalcATMForceKernel(name, platform, cc) { + } + ComputeContext& getInnerComputeContext(ContextImpl& innerContext) { + return *reinterpret_cast(innerContext.getPlatformData())->contexts[0]; + } +}; + } // namespace OpenMM #endif /*OPENMM_HIPKERNELS_H_*/ diff --git a/platforms/hip/include/HipNonbondedUtilities.h b/platforms/hip/include/HipNonbondedUtilities.h index d9fc3b9..9bc6485 100644 --- a/platforms/hip/include/HipNonbondedUtilities.h +++ b/platforms/hip/include/HipNonbondedUtilities.h @@ -9,7 +9,7 @@ * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * - * Portions copyright (c) 2009-2022 Stanford University and the Authors. * + * Portions copyright (c) 2009-2023 Stanford University and the Authors. * * Portions copyright (C) 2020-2023 Advanced Micro Devices, Inc. All Rights * * Reserved. * * Authors: Peter Eastman, Nicholas Curtis * @@ -83,8 +83,10 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities { * @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded * @param kernel the code to evaluate the interaction * @param forceGroup the force group in which the interaction should be calculated + * @param usesNeighborList specifies whether a neighbor list should be used to optimize this interaction. This should + * be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway. */ - void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector >& exclusionList, const std::string& kernel, int forceGroup); + void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector >& exclusionList, const std::string& kernel, int forceGroup, bool usesNeighborList = true); /** * Add a nonbonded interaction to be evaluated by the default interaction kernel. * @@ -95,9 +97,11 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities { * @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded * @param kernel the code to evaluate the interaction * @param forceGroup the force group in which the interaction should be calculated + * @param usesNeighborList specifies whether a neighbor list should be used to optimize this interaction. This should + * be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway. * @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list */ - void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector >& exclusionList, const std::string& kernel, int forceGroup, bool supportsPairList); + void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector >& exclusionList, const std::string& kernel, int forceGroup, bool usesNeighborList, bool supportsPairList); /** * Add a per-atom parameter that the default interaction kernel may depend on. */ @@ -336,12 +340,15 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities { HipArray sortedBlocks; HipArray sortedBlockCenter; HipArray sortedBlockBoundingBox; + HipArray blockSizeRange; + HipArray largeBlockCenter; + HipArray largeBlockBoundingBox; HipArray oldPositions; HipArray rebuildNeighborList; HipSort* blockSorter; hipEvent_t downloadCountEvent; unsigned int* pinnedCountBuffer; - std::vector forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs, copyInteractionCountsArgs; + std::vector forceArgs, findBlockBoundsArgs, computeSortKeysArgs, sortBoxDataArgs, findInteractingBlocksArgs, copyInteractionCountsArgs; std::vector > atomExclusions; std::vector parameters; std::vector arguments; @@ -349,7 +356,7 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities { std::map groupCutoff; std::map groupKernelSource; double lastCutoff; - bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList; + bool useCutoff, usePeriodic, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, canUsePairList, useLargeBlocks; int startTileIndex, startBlockIndex, numBlocks, numTilesInBatch, maxExclusions; int numForceThreadBlocks, forceThreadBlockSize, findInteractingBlocksThreadBlockSize, numAtoms, groupFlags; unsigned int maxTiles, maxSinglePairs, tilesAfterReorder; @@ -368,6 +375,7 @@ class HipNonbondedUtilities::KernelSet { std::string source; hipFunction_t forceKernel, energyKernel, forceEnergyKernel; hipFunction_t findBlockBoundsKernel; + hipFunction_t computeSortKeysKernel; hipFunction_t sortBoxDataKernel; hipFunction_t findInteractingBlocksKernel; hipFunction_t copyInteractionCountsKernel; diff --git a/platforms/hip/src/HipContext.cpp b/platforms/hip/src/HipContext.cpp index 771496b..3eca0e3 100644 --- a/platforms/hip/src/HipContext.cpp +++ b/platforms/hip/src/HipContext.cpp @@ -40,7 +40,9 @@ #include "HipNonbondedUtilities.h" #include "HipProgram.h" #include "HipFFTImplFFT3D.h" +#ifdef OPENMM_HIP_WITH_HIPFFT #include "HipFFTImplHipFFT.h" +#endif #include "HipFFTImplVkFFT.h" #include "openmm/common/ComputeArray.h" #include "openmm/common/ContextSelector.h" @@ -489,7 +491,7 @@ string HipContext::getHash(const string& src) const { string HipContext::getCacheFileName(const string& src) const { stringstream cacheFile; - cacheFile << cacheDir << getHash(src) << '_' << gpuArchitecture; + cacheFile << cacheDir << "openmm-hip-" << getHash(src + gpuArchitecture); return cacheFile.str(); } @@ -500,7 +502,7 @@ hipModule_t HipContext::createModule(const string source) { hipModule_t HipContext::createModule(const string source, const map& defines) { const char* saveTempsEnv = getenv("OPENMM_SAVE_TEMPS"); bool saveTemps = saveTempsEnv != nullptr; - string options = "-ffast-math -munsafe-fp-atomics -Wall"; + string options = "-O3 -ffast-math -munsafe-fp-atomics -Wall"; // HIP-TODO: Remove it when the compiler does a better job // Disable SLP vectorization as it may generate unoptimal packed math instructions on >=MI200 // (gfx90a): more v_mov, higher register usage etc. @@ -594,35 +596,29 @@ hipModule_t HipContext::createModule(const string source, const map ptx = compilerKernel.getAs().createModule(src.str(), options, *this); + vector code = compilerKernel.getAs().createModule(src.str(), options, *this); // If possible, write the PTX out to a temporary file so we can cache it for later use. - bool wroteCache = false; try { - ofstream out(outputFile.c_str()); - out.write(&ptx[0], ptx.size()); + ofstream out(cacheFile.c_str(), ios::out | ios::binary); + out.write(&code[0], code.size()); out.close(); - if (!out.fail()) - wroteCache = true; } catch (...) { + // An error occurred. Possibly we don't have permission to write to the temp directory. // Ignore. } - if (!wroteCache) { - // An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly. - - CHECK_RESULT2(hipModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading HIP module"); - loadedModules.push_back(module); - return module; - } + CHECK_RESULT2(hipModuleLoadDataEx(&module, &code[0], 0, NULL, NULL), "Error loading HIP module"); + loadedModules.push_back(module); + return module; } else { // Write out the source to a temporary file. @@ -630,48 +626,55 @@ hipModule_t HipContext::createModule(const string source, const map \""+logFile+"\""; + string command = compiler + " -x hip --offload-device-only --offload-arch=" + gpuArchitecture + " " + options + (saveTemps ? " -save-temps=obj" : "") +" -o \""+outputFile+"\" " + " \""+inputFile+"\" 2> \""+logFile+"\""; res = std::system(command.c_str()); - } - try { - if (res != 0) { - // Load the error log. - - stringstream error; - error << "Error launching HIP compiler: " << res; - ifstream log(logFile.c_str()); - if (log.is_open()) { - string line; - while (!log.eof()) { - getline(log, line); - error << '\n' << line; + try { + if (res != 0) { + // Load the error log. + + stringstream error; + error << "Error launching HIP compiler: " << res; + ifstream log(logFile.c_str()); + if (log.is_open()) { + string line; + while (!log.eof()) { + getline(log, line); + error << '\n' << line; + } + log.close(); } - log.close(); + throw OpenMMException(error.str()); } - throw OpenMMException(error.str()); - } - hipError_t result = hipModuleLoad(&module, outputFile.c_str()); - if (result != hipSuccess) { - std::stringstream m; - m<<"Error loading HIP module: "< code; + ifstream out(outputFile.c_str(), ios::in | ios::binary); + if (!out.is_open()) { + std::stringstream error; + error << "Error reading HIP module from `" << outputFile << "`"; + throw OpenMMException(error.str()); + } + code.insert(code.begin(), istreambuf_iterator(out), istreambuf_iterator()); + out.close(); + + if (!saveTemps) { + remove(inputFile.c_str()); + remove(logFile.c_str()); + } + if (rename(outputFile.c_str(), cacheFile.c_str()) != 0 && !saveTemps) + remove(outputFile.c_str()); + + CHECK_RESULT2(hipModuleLoadDataEx(&module, &code[0], 0, NULL, NULL), "Error loading HIP module"); + loadedModules.push_back(module); + return module; } - if (rename(outputFile.c_str(), cacheFile.c_str()) != 0 && !saveTemps) - remove(outputFile.c_str()); - loadedModules.push_back(module); - return module; - } - catch (...) { - if (!saveTemps) { - remove(inputFile.c_str()); - remove(outputFile.c_str()); - remove(logFile.c_str()); + catch (...) { + if (!saveTemps) { + remove(inputFile.c_str()); + remove(outputFile.c_str()); + remove(logFile.c_str()); + } + throw; } - throw; } } @@ -708,7 +711,11 @@ ComputeEvent HipContext::createEvent() { HipFFTBase* HipContext::createFFT(int xsize, int ysize, int zsize, bool realToComplex, hipStream_t stream, HipArray& in, HipArray& out) { if (fftBackend == 1) { +#ifdef OPENMM_HIP_WITH_HIPFFT return new HipFFTImplHipFFT(*this, xsize, ysize, zsize, realToComplex, stream, in, out); +#else + throw OpenMMException("OpenMM HIP is not built with hipFFT support"); +#endif } else if (fftBackend == 2) { return new HipFFTImplVkFFT(*this, xsize, ysize, zsize, realToComplex, stream, in, out); @@ -720,7 +727,9 @@ HipFFTBase* HipContext::createFFT(int xsize, int ysize, int zsize, bool realToCo int HipContext::findLegalFFTDimension(int minimum) { if (fftBackend == 1) { +#ifdef OPENMM_HIP_WITH_HIPFFT return HipFFTImplHipFFT::findLegalDimension(minimum); +#endif } else if (fftBackend == 2) { return HipFFTImplVkFFT::findLegalDimension(minimum); @@ -904,24 +913,20 @@ vector HipContext::getDevicePrecedence() { int numDevices; hipDeviceProp_t thisDevice; string errorMessage = "Error initializing Context"; - vector, int> > devices; + vector > devices; CHECK_RESULT(hipGetDeviceCount(&numDevices)); for (int i = 0; i < numDevices; i++) { CHECK_RESULT(hipGetDeviceProperties(&thisDevice, i)); int clock, multiprocessors, speed; // AMD GPU - // gcn arch is available if needed, however... - int major = thisDevice.gcnArch; clock = thisDevice.clockRate; multiprocessors = thisDevice.multiProcessorCount; speed = clock*multiprocessors; - pair deviceProperties = std::make_pair(major, speed); - devices.push_back(std::make_pair(deviceProperties, -i)); + devices.push_back(std::make_pair(speed, -i)); } - // sort first by compute capability (higher is better), then speed - // (higher is better), and finally device index (lower is better) + // sort first by speed (higher is better), and finally device index (lower is better) std::sort(devices.begin(), devices.end()); std::reverse(devices.begin(), devices.end()); diff --git a/platforms/hip/src/HipFFTImplHipFFT.cpp b/platforms/hip/src/HipFFTImplHipFFT.cpp index 6adebdf..ecf6f16 100644 --- a/platforms/hip/src/HipFFTImplHipFFT.cpp +++ b/platforms/hip/src/HipFFTImplHipFFT.cpp @@ -26,6 +26,8 @@ * along with this program. If not, see . * * -------------------------------------------------------------------------- */ +#ifdef OPENMM_HIP_WITH_HIPFFT + #include "HipFFTImplHipFFT.h" #include "HipContext.h" @@ -121,3 +123,5 @@ int HipFFTImplHipFFT::findLegalDimension(int minimum) { minimum++; } } + +#endif // OPENMM_HIP_WITH_HIPFFT diff --git a/platforms/hip/src/HipKernelFactory.cpp b/platforms/hip/src/HipKernelFactory.cpp index 9ef58a8..f75d1fe 100644 --- a/platforms/hip/src/HipKernelFactory.cpp +++ b/platforms/hip/src/HipKernelFactory.cpp @@ -6,7 +6,7 @@ * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * - * Portions copyright (c) 2008-2019 Stanford University and the Authors. * + * Portions copyright (c) 2008-2023 Stanford University and the Authors. * * Portions copyright (C) 2020 Advanced Micro Devices, Inc. All Rights * * Reserved. * * Authors: Peter Eastman, Nicholas Curtis * @@ -113,6 +113,8 @@ KernelImpl* HipKernelFactory::createKernelImpl(std::string name, const Platform& return new CommonCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem()); if (name == CalcCustomCVForceKernel::Name()) return new HipCalcCustomCVForceKernel(name, platform, cu); + if (name == CalcCustomCPPForceKernel::Name()) + return new CommonCalcCustomCPPForceKernel(name, platform, context, cu); if (name == CalcRMSDForceKernel::Name()) return new CommonCalcRMSDForceKernel(name, platform, cu); if (name == CalcCustomManyParticleForceKernel::Name()) @@ -141,5 +143,7 @@ KernelImpl* HipKernelFactory::createKernelImpl(std::string name, const Platform& return new CommonApplyMonteCarloBarostatKernel(name, platform, cu); if (name == RemoveCMMotionKernel::Name()) return new CommonRemoveCMMotionKernel(name, platform, cu); + if (name == CalcATMForceKernel::Name() ) + return new HipCalcATMForceKernel(name, platform, cu); throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str()); } diff --git a/platforms/hip/src/HipKernels.cpp b/platforms/hip/src/HipKernels.cpp index fc816e4..df2cf69 100644 --- a/platforms/hip/src/HipKernels.cpp +++ b/platforms/hip/src/HipKernels.cpp @@ -1001,7 +1001,7 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond } source = cu.replaceStrings(source, replacements); if (force.getIncludeDirectSpace()) - cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true); + cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), numParticles > 3000, true); // Initialize the exceptions. diff --git a/platforms/hip/src/HipNonbondedUtilities.cpp b/platforms/hip/src/HipNonbondedUtilities.cpp index 5dd6fe0..4307c0d 100644 --- a/platforms/hip/src/HipNonbondedUtilities.cpp +++ b/platforms/hip/src/HipNonbondedUtilities.cpp @@ -6,7 +6,7 @@ * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * - * Portions copyright (c) 2009-2022 Stanford University and the Authors. * + * Portions copyright (c) 2009-2023 Stanford University and the Authors. * * Portions copyright (C) 2020-2023 Advanced Micro Devices, Inc. All Rights * * Reserved. * * Authors: Peter Eastman, Nicholas Curtis * @@ -51,21 +51,18 @@ using namespace std; class HipNonbondedUtilities::BlockSortTrait : public HipSort::SortTrait { public: - BlockSortTrait(bool useDouble) : useDouble(useDouble) { - } - int getDataSize() const {return useDouble ? sizeof(double2) : sizeof(float2);} - int getKeySize() const {return useDouble ? sizeof(double) : sizeof(float);} - const char* getDataType() const {return "real2";} - const char* getKeyType() const {return "real";} - const char* getMinKey() const {return "-3.40282e+38f";} - const char* getMaxKey() const {return "3.40282e+38f";} - const char* getMaxValue() const {return "make_real2(3.40282e+38f, 3.40282e+38f)";} - const char* getSortKey() const {return "value.x";} -private: - bool useDouble; + BlockSortTrait() {} + int getDataSize() const {return sizeof(int);} + int getKeySize() const {return sizeof(int);} + const char* getDataType() const {return "unsigned int";} + const char* getKeyType() const {return "unsigned int";} + const char* getMinKey() const {return "0";} + const char* getMaxKey() const {return "0xFFFFFFFFu";} + const char* getMaxValue() const {return "0xFFFFFFFFu";} + const char* getSortKey() const {return "value";} }; -HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true), +HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) { // Decide how many thread blocks to use. @@ -75,6 +72,13 @@ HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(cont numForceThreadBlocks = 5*4*context.getMultiprocessors(); forceThreadBlockSize = 64; findInteractingBlocksThreadBlockSize = context.getSIMDWidth(); + + // When building the neighbor list, we can optionally use large blocks (32 * warpSize atoms) to + // accelerate the process. This makes building the neighbor list faster, but it prevents + // us from sorting atom blocks by size, which leads to a slightly less efficient neighbor + // list. We guess based on system size which will be faster. + + useLargeBlocks = (context.getNumAtoms() > 90000); setKernelSource(HipKernelSources::nonbonded); } @@ -86,11 +90,11 @@ HipNonbondedUtilities::~HipNonbondedUtilities() { hipEventDestroy(downloadCountEvent); } -void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector >& exclusionList, const string& kernel, int forceGroup) { - addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, false); +void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList) { + addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, usesNeighborList, false); } -void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector >& exclusionList, const string& kernel, int forceGroup, bool supportsPairList) { +void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList, bool supportsPairList) { if (groupCutoff.size() > 0) { if (usesCutoff != useCutoff) throw OpenMMException("All Forces must agree on whether to use a cutoff"); @@ -103,6 +107,7 @@ void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, b requestExclusions(exclusionList); useCutoff = usesCutoff; usePeriodic = usesPeriodic; + useNeighborList |= (usesNeighborList && useCutoff); groupCutoff[forceGroup] = cutoffDistance; groupFlags |= 1< exclusionTilesVec; for (set >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) exclusionTilesVec.push_back(make_int2(iter->first, iter->second)); - sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 || !useCutoff ? compareInt2 : compareInt2LargeSIMD); + sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 || !useNeighborList ? compareInt2 : compareInt2LargeSIMD); exclusionTiles.initialize(context, exclusionTilesVec.size(), "exclusionTiles"); exclusionTiles.upload(exclusionTilesVec); map, int> exclusionTileMap; @@ -291,15 +296,23 @@ void HipNonbondedUtilities::initialize(const System& system) { int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter"); blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox"); - sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks"); + sortedBlocks.initialize(context, numAtomBlocks, "sortedBlocks"); sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter"); sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox"); + blockSizeRange.initialize(context, 2, elementSize, "blockSizeRange"); + largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter"); + largeBlockBoundingBox.initialize(context, numAtomBlocks*4, elementSize, "largeBlockBoundingBox"); oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions"); rebuildNeighborList.initialize(context, 1, "rebuildNeighborList"); - blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false); + blockSorter = new HipSort(context, new BlockSortTrait(), numAtomBlocks, false); vector count(2, 0); interactionCount.upload(count); rebuildNeighborList.upload(&count[0]); + if (context.getUseDoublePrecision()) { + blockSizeRange.upload(vector{1e38, 0}); + } else { + blockSizeRange.upload(vector{1e38, 0}); + } } // Record arguments for kernels. @@ -343,17 +356,30 @@ void HipNonbondedUtilities::initialize(const System& system) { findBlockBoundsArgs.push_back(&blockCenter.getDevicePointer()); findBlockBoundsArgs.push_back(&blockBoundingBox.getDevicePointer()); findBlockBoundsArgs.push_back(&rebuildNeighborList.getDevicePointer()); - findBlockBoundsArgs.push_back(&sortedBlocks.getDevicePointer()); + findBlockBoundsArgs.push_back(&blockSizeRange.getDevicePointer()); + computeSortKeysArgs.push_back(&blockBoundingBox.getDevicePointer()); + computeSortKeysArgs.push_back(&sortedBlocks.getDevicePointer()); + computeSortKeysArgs.push_back(&blockSizeRange.getDevicePointer()); sortBoxDataArgs.push_back(&sortedBlocks.getDevicePointer()); sortBoxDataArgs.push_back(&blockCenter.getDevicePointer()); sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer()); sortBoxDataArgs.push_back(&sortedBlockCenter.getDevicePointer()); sortBoxDataArgs.push_back(&sortedBlockBoundingBox.getDevicePointer()); + if (useLargeBlocks) { + sortBoxDataArgs.push_back(&largeBlockCenter.getDevicePointer()); + sortBoxDataArgs.push_back(&largeBlockBoundingBox.getDevicePointer()); + sortBoxDataArgs.push_back(context.getPeriodicBoxSizePointer()); + sortBoxDataArgs.push_back(context.getInvPeriodicBoxSizePointer()); + sortBoxDataArgs.push_back(context.getPeriodicBoxVecXPointer()); + sortBoxDataArgs.push_back(context.getPeriodicBoxVecYPointer()); + sortBoxDataArgs.push_back(context.getPeriodicBoxVecZPointer()); + } sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer()); sortBoxDataArgs.push_back(&oldPositions.getDevicePointer()); sortBoxDataArgs.push_back(&interactionCount.getDevicePointer()); sortBoxDataArgs.push_back(&rebuildNeighborList.getDevicePointer()); sortBoxDataArgs.push_back(&forceRebuildNeighborList); + sortBoxDataArgs.push_back(&blockSizeRange.getDevicePointer()); findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer()); findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer()); findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer()); @@ -371,6 +397,10 @@ void HipNonbondedUtilities::initialize(const System& system) { findInteractingBlocksArgs.push_back(&sortedBlocks.getDevicePointer()); findInteractingBlocksArgs.push_back(&sortedBlockCenter.getDevicePointer()); findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox.getDevicePointer()); + if (useLargeBlocks) { + findInteractingBlocksArgs.push_back(&largeBlockCenter.getDevicePointer()); + findInteractingBlocksArgs.push_back(&largeBlockBoundingBox.getDevicePointer()); + } findInteractingBlocksArgs.push_back(&exclusionIndices.getDevicePointer()); findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer()); findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer()); @@ -398,23 +428,24 @@ void HipNonbondedUtilities::prepareInteractions(int forceGroups) { return; if (groupKernels.find(forceGroups) == groupKernels.end()) createKernelsForGroups(forceGroups); - if (!useCutoff) - return; - if (numTiles == 0) - return; KernelSet& kernels = groupKernels[forceGroups]; - if (usePeriodic) { + if (useCutoff && usePeriodic) { double4 box = context.getPeriodicBoxSize(); double minAllowedSize = 1.999999*kernels.cutoffDistance; if (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize) throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff."); } + if (!useNeighborList) + return; + if (numTiles == 0) + return; // Compute the neighbor list. if (lastCutoff != kernels.cutoffDistance) forceRebuildNeighborList = true; context.executeKernelFlat(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getPaddedNumAtoms(), context.getSIMDWidth()); + context.executeKernelFlat(kernels.computeSortKeysKernel, &computeSortKeysArgs[0], context.getNumAtomBlocks()); blockSorter->sort(sortedBlocks); context.executeKernelFlat(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms(), 64); context.executeKernelFlat(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtomBlocks() * context.getSIMDWidth() * numTilesInBatch, findInteractingBlocksThreadBlockSize); @@ -434,7 +465,7 @@ void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeFor kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy); context.executeKernelFlat(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); } - if (useCutoff && numTiles > 0) { + if (useNeighborList && numTiles > 0) { hipEventSynchronize(downloadCountEvent); updateNeighborListSize(); } @@ -523,6 +554,8 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) { defines["USE_PERIODIC"] = "1"; if (context.getBoxIsTriclinic()) defines["TRICLINIC"] = "1"; + if (useLargeBlocks) + defines["USE_LARGE_BLOCKS"] = "1"; defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions); int maxBits = 0; if (canUsePairList) { @@ -551,8 +584,14 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) { defines["MAX_BITS_FOR_PAIRS"] = context.intToString(maxBits); defines["NUM_TILES_IN_BATCH"] = context.intToString(numTilesInBatch); defines["GROUP_SIZE"] = context.intToString(findInteractingBlocksThreadBlockSize); + int binShift = 1; + while (1<getMinKey(); replacements["MAX_KEY"] = trait->getMaxKey(); replacements["MAX_VALUE"] = trait->getMaxValue(); + replacements["UNIFORM"] = (uniform ? "1" : "0"); hipModule_t module = context.createModule(context.replaceStrings(HipKernelSources::sort, replacements)); shortListKernel = context.getKernel(module, "sortShortList"); shortList2Kernel = context.getKernel(module, "sortShortList2"); computeRangeKernel = context.getKernel(module, "computeRange"); - assignElementsKernel = context.getKernel(module, "assignElementsToBuckets"); + assignElementsKernel = context.getKernel(module, uniform ? "assignElementsToBuckets" : "assignElementsToBuckets2"); computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions"); copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets"); sortBucketsKernel = context.getKernel(module, "sortBuckets"); @@ -59,7 +60,7 @@ HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, boo int maxSharedMem; hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice()); int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2; - int maxShortList = min(3000, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks())); + int maxShortList = min(1024, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks())); isShortList = (length <= maxShortList); sortKernelSize = 256; rangeKernelSize = 256; diff --git a/platforms/hip/src/kernels/findInteractingBlocks.hip b/platforms/hip/src/kernels/findInteractingBlocks.hip index 65db0f8..4a095ed 100644 --- a/platforms/hip/src/kernels/findInteractingBlocks.hip +++ b/platforms/hip/src/kernels/findInteractingBlocks.hip @@ -1,3 +1,5 @@ +#include + #define BUFFER_SIZE 256 #if defined(AMD_RDNA) @@ -18,12 +20,27 @@ __device__ inline int warpPopc(warpflags x) { #endif +struct alignas(sizeof(__half) * 4) BoundingBox { + __device__ BoundingBox(real3 f) { + // Round up so we'll err on the side of making the box a little too large. + // This ensures interactions will never be missed. + v[0] = __float2half_ru((float) f.x); + v[1] = __float2half_ru((float) f.y); + v[2] = __float2half_ru((float) f.z); + } + __device__ real3 toReal3() const { + return make_real3(__half2float(v[0]), __half2float(v[1]), __half2float(v[2])); + } +private: + __half v[3]; +}; + /** * Find a bounding box for the atoms in each block. */ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList, - real2* __restrict__ sortedBlocks) { + real2* __restrict__ blockSizeRange) { const int indexInTile = threadIdx.x%TILE_SIZE; const int index = warpSize == TILE_SIZE ? blockIdx.x : (blockIdx.x*(warpSize/TILE_SIZE) + threadIdx.x/TILE_SIZE); const int base = index * TILE_SIZE; @@ -97,28 +114,77 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, center.w = SQRT(tcenter); blockBoundingBox[index] = blockSize; blockCenter[index] = center; - // blockSize.x+blockSize.y+blockSize.z has a distibution that looks like a normal distribution. - // This causes HipSort's buckets to have very non-uniform sizes, so a few very long buckets are - // sorted in global memory. -1/max(x, y, z) or -1/(x+y+z) have a "faster" distribution. - sortedBlocks[index] = make_real2(-RECIP(max(max(blockSize.x, blockSize.y), blockSize.z)), index); + + // Record the range of sizes. + real totalSize = blockSize.x+blockSize.y+blockSize.z; + atomicMin(&blockSizeRange->x, totalSize); + atomicMax(&blockSizeRange->y, totalSize); } if (blockIdx.x == 0 && threadIdx.x == 0) rebuildNeighborList[0] = 0; } +extern "C" __global__ void computeSortKeys(const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ sortedBlocks, const real2* __restrict__ blockSizeRange/*, int numSizes*/) { + // Sort keys store the bin in the high order part and the block in the low + // order part. + + real2 sizeRange = make_real2(LOG(blockSizeRange->x), LOG(blockSizeRange->y)); + int numSizeBins = 20; + real scale = numSizeBins/(sizeRange.y-sizeRange.x); + unsigned int i = threadIdx.x+blockIdx.x*blockDim.x; + if (i < NUM_BLOCKS) { + real4 box = blockBoundingBox[i]; + real size = LOG(box.x+box.y+box.z); + int bin = (size-sizeRange.x)*scale; + bin = max(0, min(bin, numSizeBins-1)); + sortedBlocks[i] = (((unsigned int) bin)<x = 1e38; + blockSizeRange->y = 0; } // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list. @@ -241,8 +307,12 @@ void mfma4x4(const float4& pos1, const float4& pos2, const vfloat& c, unsigned i extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms, int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs, - unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, - const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, + unsigned int startBlockIndex, unsigned int numBlocks, const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, + const BoundingBox* __restrict__ sortedBlockBoundingBox, +#ifdef USE_LARGE_BLOCKS + const real4* __restrict__ largeBlockCenter, const BoundingBox* __restrict__ largeBlockBoundingBox, +#endif + const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) { if (rebuildNeighborList[0] == 0) @@ -273,10 +343,9 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti if (block1 < startBlockIndex+numBlocks) { // Load data for this block. Note that all threads in a warp are processing the same block. - real2 sortedKey = sortedBlocks[block1]; - int x = (int) sortedKey.y; + int x = sortedBlocks[block1] & BLOCK_INDEX_MASK; real4 blockCenterX = sortedBlockCenter[block1]; - real4 blockSizeX = sortedBlockBoundingBox[block1]; + real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3(); int neighborsInBuffer = 0; real4 pos1 = posq[x*TILE_SIZE+indexInTile]; #ifdef USE_PERIODIC @@ -311,10 +380,52 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti // units are idle at the end of the kernel (the kernel works on the upper triangle of // the NUM_BLOCKS x NUM_BLOCKS matrix). +#ifdef USE_LARGE_BLOCKS + warpflags largeBlockFlags = 0; + int loadedLargeBlocks = 0; +#endif int block2Count = 0; // Load blocks from addresses aligned by warpSize for faster loading from sortedBlockCenter and sortedBlockBoundingBox. for (int block2Base = ((block1+1)/warpSize + warpIndex%NUM_TILES_IN_BATCH)*warpSize; block2Base < NUM_BLOCKS; block2Base += warpSize*NUM_TILES_IN_BATCH) { + // Last iteration cannot be skipped (on CDNA where tilesPerWarp == 2) const bool lastIteration = block2Base + warpSize*NUM_TILES_IN_BATCH >= NUM_BLOCKS; +#ifdef USE_LARGE_BLOCKS + if (loadedLargeBlocks == 0) { + // Check the next set of large blocks. + + int largeBlockIndex = block2Base + warpSize*NUM_TILES_IN_BATCH*indexInWarp; + bool includeLargeBlock = false; + if (largeBlockIndex < NUM_BLOCKS) { + real4 largeCenter = largeBlockCenter[largeBlockIndex]; + real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3(); + real4 blockDelta = blockCenterX-largeCenter; +#ifdef USE_PERIODIC + APPLY_PERIODIC_TO_DELTA(blockDelta) +#endif + blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x); + blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y); + blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z); + includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED); +#ifdef TRICLINIC + // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away. + // If there's any possibility we might have missed it, do a detailed check. + + if (periodicBoxSize.z/2-blockSizeX.z-largeSize.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-largeSize.y < PADDED_CUTOFF) + includeLargeBlock = true; +#endif + } + largeBlockFlags = __ballot(includeLargeBlock); + loadedLargeBlocks = warpSize; + } + loadedLargeBlocks--; + if ((largeBlockFlags&1) == 0 && !lastIteration) { + // None of the next warpSize blocks interact with block 1. + + largeBlockFlags >>= 1; + continue; + } + largeBlockFlags >>= 1; +#endif int block2 = block2Base+indexInWarp; bool includeBlock2 = (block1 < block2 && block2 < NUM_BLOCKS); block2 = includeBlock2 ? block2 : block1; @@ -329,7 +440,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti if (!lastIteration && __ballot(includeBlock2) == 0) continue; #endif - real4 blockSizeY = sortedBlockBoundingBox[block2]; + real3 blockSizeY = sortedBlockBoundingBox[block2].toReal3(); blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x); blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y); blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z); @@ -361,7 +472,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti const int b = block2Buffer[min(block2Index + tileInWarp, block2Count - 1)]; const bool forceInclude = b & 1; const int block2 = b >> 1; - int y = (int) sortedBlocks[block2].y; + int y = sortedBlocks[block2] & BLOCK_INDEX_MASK; #pragma unroll 1 for (int k = indexInTile; k < numExclusions; k += TILE_SIZE) diff --git a/platforms/hip/src/kernels/nonbonded.hip b/platforms/hip/src/kernels/nonbonded.hip index d6e1716..9cbaae0 100644 --- a/platforms/hip/src/kernels/nonbonded.hip +++ b/platforms/hip/src/kernels/nonbonded.hip @@ -237,7 +237,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all // of them (no cutoff). -#ifdef USE_CUTOFF +#ifdef USE_NEIGHBOR_LIST const unsigned int numTiles = interactionCount[0]; if (numTiles > maxTiles) return; // There wasn't enough memory for the neighbor list. @@ -262,7 +262,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded // Extract the coordinates of this tile. int x, y; bool singlePeriodicCopy = false; -#ifdef USE_CUTOFF +#ifdef USE_NEIGHBOR_LIST x = tiles[pos]; real4 blockSizeX = blockSize[x]; singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF && @@ -297,7 +297,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded // Load atom data for this tile. real4 posq1 = posq[atom1]; LOAD_ATOM1_PARAMETERS -#ifdef USE_CUTOFF +#ifdef USE_NEIGHBOR_LIST unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx]; #else unsigned int j = y*TILE_SIZE + tgx; @@ -454,7 +454,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded // Third loop: single pairs that aren't part of a tile. -#if USE_CUTOFF +#if USE_NEIGHBOR_LIST const unsigned int numPairs = interactionCount[1]; if (numPairs > maxSinglePairs) return; // There wasn't enough memory for the neighbor list. diff --git a/platforms/hip/src/kernels/sort.hip b/platforms/hip/src/kernels/sort.hip index 288c629..3efe82d 100644 --- a/platforms/hip/src/kernels/sort.hip +++ b/platforms/hip/src/kernels/sort.hip @@ -95,6 +95,7 @@ inline __device__ void reduceMinMax(KEY_TYPE minimum, KEY_TYPE maximum, KEY_TYPE */ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, volatile KEY_TYPE* __restrict__ range, unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) { +#if UNIFORM extern __shared__ KEY_TYPE minBuffer[]; KEY_TYPE* maxBuffer = minBuffer+blockDim.x; KEY_TYPE minimum = MAX_KEY; @@ -131,6 +132,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le } reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[0], &range[1]); } +#endif // Clear the bucket counters in preparation for the next kernel. @@ -139,7 +141,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le } /** - * Assign elements to buckets. + * Assign elements to buckets. This version is optimized for uniformly distributed data. */ __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) { @@ -154,6 +156,86 @@ __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsi } } + +/** + * Assign elements to buckets. This version is optimized for non-uniformly distributed data. + */ +__global__ void assignElementsToBuckets2(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range, + unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) { + // Load 64 datapoints and sort them to get an estimate of the data distribution. + + __shared__ KEY_TYPE elements[64]; + if (threadIdx.x < 64) { + int index = (int) (threadIdx.x*length/64.0); + elements[threadIdx.x] = getValue(data[index]); + } + __syncthreads(); + for (unsigned int k = 2; k <= 64; k *= 2) { + for (unsigned int j = k/2; j > 0; j /= 2) { + if (threadIdx.x < 64) { + int ixj = threadIdx.x^j; + if (ixj > threadIdx.x) { + KEY_TYPE value1 = elements[threadIdx.x]; + KEY_TYPE value2 = elements[ixj]; + bool ascending = (threadIdx.x&k) == 0; + KEY_TYPE lowKey = (ascending ? value1 : value2); + KEY_TYPE highKey = (ascending ? value2 : value1); + if (lowKey > highKey) { + elements[threadIdx.x] = value2; + elements[ixj] = value1; + } + } + } + __syncthreads(); + } + } + + // Create a function composed of linear segments mapping data values to bucket indices. + + __shared__ float segmentLowerBound[9]; + __shared__ float segmentBaseIndex[9]; + __shared__ float segmentIndexScale[9]; + if (threadIdx.x == 0) { + segmentLowerBound[0] = elements[0]-0.2f*(elements[5]-elements[0]); + segmentLowerBound[1] = elements[5]; + segmentLowerBound[2] = elements[10]; + segmentLowerBound[3] = elements[20]; + segmentLowerBound[4] = elements[30]; + segmentLowerBound[5] = elements[40]; + segmentLowerBound[6] = elements[50]; + segmentLowerBound[7] = elements[60]; + segmentLowerBound[8] = elements[63]+0.2f*(elements[63]-elements[58]); + segmentBaseIndex[0] = numBuckets/16; + segmentBaseIndex[1] = 3*numBuckets/16; + segmentBaseIndex[2] = 5*numBuckets/16; + segmentBaseIndex[3] = 7*numBuckets/16; + segmentBaseIndex[4] = 9*numBuckets/16; + segmentBaseIndex[5] = 11*numBuckets/16; + segmentBaseIndex[6] = 13*numBuckets/16; + segmentBaseIndex[7] = 15*numBuckets/16; + segmentBaseIndex[8] = numBuckets; + for (int i = 0; i < 8; i++) + if (segmentLowerBound[i+1] == segmentLowerBound[i]) + segmentIndexScale[i] = 0; + else + segmentIndexScale[i] = (segmentBaseIndex[i+1]-segmentBaseIndex[i])/(segmentLowerBound[i+1]-segmentLowerBound[i]); + } + __syncthreads(); + + // Assign elements to buckets. + + for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) { + float key = (float) getValue(data[index]); + int segment; + for (segment = 0; segment < 7 && key > segmentLowerBound[segment+1]; segment++) + ; + unsigned int bucketIndex = segmentBaseIndex[segment]+(key-segmentLowerBound[segment])*segmentIndexScale[segment]; + bucketIndex = min(max(0, bucketIndex), numBuckets-1); + offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1); + bucketOfElement[index] = bucketIndex; + } +} + /** * Sum the bucket sizes to compute the start position of each bucket. This kernel * is executed as a single work group. diff --git a/platforms/hip/tests/TestHipATMForce.cpp b/platforms/hip/tests/TestHipATMForce.cpp new file mode 100644 index 0000000..6a594fc --- /dev/null +++ b/platforms/hip/tests/TestHipATMForce.cpp @@ -0,0 +1,36 @@ +/* -------------------------------------------------------------------------- * + * OpenMM * + * -------------------------------------------------------------------------- * + * This is part of the OpenMM molecular simulation toolkit originating from * + * Simbios, the NIH National Center for Physics-Based Simulation of * + * Biological Structures at Stanford, funded under the NIH Roadmap for * + * Medical Research, grant U54 GM072970. See https://simtk.org. * + * * + * Portions copyright (c) 2023 Stanford University and the Authors. * + * Authors: Peter Eastman * + * Contributors: * + * * + * Permission is hereby granted, free of charge, to any person obtaining a * + * copy of this software and associated documentation files (the "Software"), * + * to deal in the Software without restriction, including without limitation * + * the rights to use, copy, modify, merge, publish, distribute, sublicense, * + * and/or sell copies of the Software, and to permit persons to whom the * + * Software is furnished to do so, subject to the following conditions: * + * * + * The above copyright notice and this permission notice shall be included in * + * all copies or substantial portions of the Software. * + * * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * + * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, * + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * + * USE OR OTHER DEALINGS IN THE SOFTWARE. * + * -------------------------------------------------------------------------- */ + +#include "HipTests.h" +#include "TestATMForce.h" + +void runPlatformTests() { +} diff --git a/platforms/hip/tests/TestHipCustomCPPForce.cpp b/platforms/hip/tests/TestHipCustomCPPForce.cpp new file mode 100644 index 0000000..7910447 --- /dev/null +++ b/platforms/hip/tests/TestHipCustomCPPForce.cpp @@ -0,0 +1,36 @@ +/* -------------------------------------------------------------------------- * + * OpenMM * + * -------------------------------------------------------------------------- * + * This is part of the OpenMM molecular simulation toolkit originating from * + * Simbios, the NIH National Center for Physics-Based Simulation of * + * Biological Structures at Stanford, funded under the NIH Roadmap for * + * Medical Research, grant U54 GM072970. See https://simtk.org. * + * * + * Portions copyright (c) 2023 Stanford University and the Authors. * + * Authors: Peter Eastman * + * Contributors: * + * * + * Permission is hereby granted, free of charge, to any person obtaining a * + * copy of this software and associated documentation files (the "Software"), * + * to deal in the Software without restriction, including without limitation * + * the rights to use, copy, modify, merge, publish, distribute, sublicense, * + * and/or sell copies of the Software, and to permit persons to whom the * + * Software is furnished to do so, subject to the following conditions: * + * * + * The above copyright notice and this permission notice shall be included in * + * all copies or substantial portions of the Software. * + * * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * + * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, * + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * + * USE OR OTHER DEALINGS IN THE SOFTWARE. * + * -------------------------------------------------------------------------- */ + +#include "HipTests.h" +#include "TestCustomCPPForce.h" + +void runPlatformTests() { +} diff --git a/platforms/hip/tests/TestHipFFTImplHipFFT.cpp b/platforms/hip/tests/TestHipFFTImplHipFFT.cpp index 9753761..f404a3f 100644 --- a/platforms/hip/tests/TestHipFFTImplHipFFT.cpp +++ b/platforms/hip/tests/TestHipFFTImplHipFFT.cpp @@ -35,6 +35,8 @@ * This tests the hipFFT-based implementation of FFT. */ +#ifdef OPENMM_HIP_WITH_HIPFFT + #include "openmm/internal/AssertionUtilities.h" #include "HipArray.h" #include "HipContext.h" @@ -169,3 +171,11 @@ int main(int argc, char* argv[]) { cout << "Done" << endl; return 0; } + +#else // OPENMM_HIP_WITH_HIPFFT + +int main(int argc, char* argv[]) { + return 0; +} + +#endif // OPENMM_HIP_WITH_HIPFFT