diff --git a/CMakeLists.txt b/CMakeLists.txt
index bbb913e..dd8755e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -87,13 +87,7 @@ ENDFOREACH(subdir)
 LIST(APPEND CMAKE_PREFIX_PATH $ENV{ROCM_PATH} /opt/rocm)
 
 FIND_PACKAGE(HIP CONFIG REQUIRED)
-FIND_PACKAGE(HIPFFT CONFIG REQUIRED)
-
-IF(${HIP_COMPILER} STREQUAL "clang")
-    MESSAGE(STATUS "Using HIP-Clang compiler")
-ELSE()
-    MESSAGE(FATAL_ERROR "HIP compiler ${HIP_COMPILER} not recognized!")
-ENDIF()
+FIND_PACKAGE(HIPFFT CONFIG)
 
 ADD_SUBDIRECTORY(platforms/hip)
 
diff --git a/README.md b/README.md
index 3bf46df..f148c6c 100644
--- a/README.md
+++ b/README.md
@@ -5,12 +5,6 @@ AMD GPUs on [AMD ROCm™ open software platform](https://rocmdocs.amd.com).
 
 ## Installing with Conda
 
-This plugin requires hipFFT and rocFFT, install them from ROCm repositories:
-
-```sh
-apt install hipfft rocfft
-```
-
 ```sh
 conda create -n openmm-env -c streamhpc -c conda-forge --strict-channel-priority openmm-hip
 conda activate openmm-env
@@ -69,7 +63,7 @@ The plugin requires source code of OpenMM, it can be downloaded as an archive
 [here](https://github.com/openmm/openmm/releases) or as a Git repository:
 
 ```sh
-git clone https://github.com/openmm/openmm.git -b 8.0.0
+git clone https://github.com/openmm/openmm.git -b 8.1.1
 ```
 
 To build the plugin, follow these steps:
@@ -100,7 +94,7 @@ source code:
 ```sh
 mkdir build build-hip install
 
-git clone https://github.com/openmm/openmm.git -b 8.0.0
+git clone https://github.com/openmm/openmm.git -b 8.1.1
 cd build
 cmake ../openmm/ -D CMAKE_INSTALL_PREFIX=../install -D OPENMM_BUILD_COMMON=ON -D OPENMM_PYTHON_USER_INSTALL=ON
 make
@@ -138,15 +132,25 @@ please try different backends:
 * the hipFFT/rocFFT-based implementation (`export OPENMM_FFT_BACKEND=1`);
 * the VkFFT-based implementation (`export OPENMM_FFT_BACKEND=2`);
 
-### The kernel compilation: hipcc and hipRTC
+The hipFFT/rocFFT-based implementation requires hipFFT and rocFFT libraries, otherwise it will be
+disabled, install them from ROCm repositories before running cmake:
+
+```sh
+apt install hipfft rocfft
+```
 
-By default, the HIP Platform builds kernels with the hipcc compiler. To run the compiler, paths
-in the following order are used:
+If you see "libhipfft.so.0: cannot open shared object file: No such file or directory", run
+`ldconfig`.
+
+### The kernel compilation: amdclang++ and hipRTC
+
+By default, the HIP Platform builds kernels with the amdclang++ compiler. To run the compiler,
+paths in the following order are used:
 
 * `properties['HipCompiler']`, if it is passed to Context constructor;
 * `OPENMM_HIP_COMPILER` environment variable, if it is set;
-* `${ROCM_PATH}/bin/hipcc`, if `ROCM_PATH` environment variable is set;
-* `/opt/rocm/bin/hipcc` otherwise.
+* `${ROCM_PATH}/bin/amdclang++`, if `ROCM_PATH` environment variable is set;
+* `/opt/rocm/bin/amdclang++` otherwise.
 
 There is an alternative way to compile kernels: hipRTC, it is implemented by
 `plugins/hipcompiler`.  To enable this way:
@@ -154,6 +158,10 @@ There is an alternative way to compile kernels: hipRTC, it is implemented by
 * set `properties['HipAllowRuntimeCompiler'] = 'true'`;
 * set `OPENMM_USE_HIPRTC` environment variable to 1 (`export OPENMM_USE_HIPRTC=1`).
 
+**Warning:** hipRTC from ROCm 6.0.0 has issues with ambiguous operators for vector and complex
+types. It seems that they have been fixed in ROCm/clr's `develop` branch and likely OpenMM+hipRTC
+will be usable with the next ROCm release.
+
 ## License
 
 The HIP Platform uses OpenMM API under the terms of the MIT License.  A copy of this license may
diff --git a/platforms/hip/CMakeLists.txt b/platforms/hip/CMakeLists.txt
index 866336a..8380625 100644
--- a/platforms/hip/CMakeLists.txt
+++ b/platforms/hip/CMakeLists.txt
@@ -110,13 +110,17 @@ IF (OPENMM_BUILD_SHARED_LIB)
     ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
     ADD_DEPENDENCIES(${SHARED_TARGET} CommonKernels HipKernels)
 
-    TARGET_LINK_LIBRARIES(${SHARED_TARGET} PUBLIC ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB} hip::host hip::hipfft)
+    TARGET_LINK_LIBRARIES(${SHARED_TARGET} PUBLIC ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB} hip::host)
     SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_COMMON_BUILDING_SHARED_LIBRARY")
     IF (APPLE)
         SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework HIP")
     ELSE (APPLE)
         SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
     ENDIF (APPLE)
+    IF(HIPFFT_FOUND)
+        TARGET_LINK_LIBRARIES(${SHARED_TARGET} PUBLIC hip::hipfft)
+        TARGET_COMPILE_OPTIONS(${SHARED_TARGET} PUBLIC "-DOPENMM_HIP_WITH_HIPFFT")
+    ENDIF(HIPFFT_FOUND)
 
     INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
 ENDIF (OPENMM_BUILD_SHARED_LIB)
@@ -127,13 +131,17 @@ IF(OPENMM_BUILD_STATIC_LIB)
     ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
     ADD_DEPENDENCIES(${STATIC_TARGET} CommonKernels HipKernels)
 
-    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB_STATIC} hip::host hip::hipfft)
+    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB_STATIC} hip::host)
     SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_COMMON_BUILDING_STATIC_LIBRARY")
     IF (APPLE)
         SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework HIP")
     ELSE (APPLE)
         SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
     ENDIF (APPLE)
+    IF(HIPFFT_FOUND)
+        TARGET_LINK_LIBRARIES(${STATIC_TARGET} PUBLIC hip::hipfft)
+        TARGET_COMPILE_OPTIONS(${STATIC_TARGET} PUBLIC "-DOPENMM_HIP_WITH_HIPFFT")
+    ENDIF(HIPFFT_FOUND)
 
     INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
 ENDIF(OPENMM_BUILD_STATIC_LIB)
diff --git a/platforms/hip/include/HipKernels.h b/platforms/hip/include/HipKernels.h
index 0c317b9..c1adf84 100644
--- a/platforms/hip/include/HipKernels.h
+++ b/platforms/hip/include/HipKernels.h
@@ -360,6 +360,18 @@ class HipCalcCustomCVForceKernel : public CommonCalcCustomCVForceKernel {
     }
 };
 
+/**
+ * This kernel is invoked by ATMForce to calculate the forces acting on the system and the energy of the system.
+ */
+class HipCalcATMForceKernel : public CommonCalcATMForceKernel {
+public:
+    HipCalcATMForceKernel(std::string name, const Platform& platform, ComputeContext& cc) : CommonCalcATMForceKernel(name, platform, cc) {
+    }
+    ComputeContext& getInnerComputeContext(ContextImpl& innerContext) {
+        return *reinterpret_cast<HipPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
+    }
+};
+
 } // namespace OpenMM
 
 #endif /*OPENMM_HIPKERNELS_H_*/
diff --git a/platforms/hip/include/HipNonbondedUtilities.h b/platforms/hip/include/HipNonbondedUtilities.h
index d9fc3b9..9bc6485 100644
--- a/platforms/hip/include/HipNonbondedUtilities.h
+++ b/platforms/hip/include/HipNonbondedUtilities.h
@@ -9,7 +9,7 @@
  * Biological Structures at Stanford, funded under the NIH Roadmap for        *
  * Medical Research, grant U54 GM072970. See https://simtk.org.               *
  *                                                                            *
- * Portions copyright (c) 2009-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2023 Stanford University and the Authors.      *
  * Portions copyright (C) 2020-2023 Advanced Micro Devices, Inc. All Rights   *
  * Reserved.                                                                  *
  * Authors: Peter Eastman, Nicholas Curtis                                    *
@@ -83,8 +83,10 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities  {
      * @param exclusionList    for each atom, specifies the list of other atoms whose interactions should be excluded
      * @param kernel           the code to evaluate the interaction
      * @param forceGroup       the force group in which the interaction should be calculated
+     * @param usesNeighborList specifies whether a neighbor list should be used to optimize this interaction.  This should
+     *                         be viewed as only a suggestion.  Even when it is false, a neighbor list may be used anyway.
      */
-    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
+    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool usesNeighborList = true);
     /**
      * Add a nonbonded interaction to be evaluated by the default interaction kernel.
      *
@@ -95,9 +97,11 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities  {
      * @param exclusionList    for each atom, specifies the list of other atoms whose interactions should be excluded
      * @param kernel           the code to evaluate the interaction
      * @param forceGroup       the force group in which the interaction should be calculated
+     * @param usesNeighborList specifies whether a neighbor list should be used to optimize this interaction.  This should
+     *                         be viewed as only a suggestion.  Even when it is false, a neighbor list may be used anyway.
      * @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
      */
-    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool supportsPairList);
+    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool usesNeighborList, bool supportsPairList);
     /**
      * Add a per-atom parameter that the default interaction kernel may depend on.
      */
@@ -336,12 +340,15 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities  {
     HipArray sortedBlocks;
     HipArray sortedBlockCenter;
     HipArray sortedBlockBoundingBox;
+    HipArray blockSizeRange;
+    HipArray largeBlockCenter;
+    HipArray largeBlockBoundingBox;
     HipArray oldPositions;
     HipArray rebuildNeighborList;
     HipSort* blockSorter;
     hipEvent_t downloadCountEvent;
     unsigned int* pinnedCountBuffer;
-    std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs, copyInteractionCountsArgs;
+    std::vector<void*> forceArgs, findBlockBoundsArgs, computeSortKeysArgs, sortBoxDataArgs, findInteractingBlocksArgs, copyInteractionCountsArgs;
     std::vector<std::vector<int> > atomExclusions;
     std::vector<ParameterInfo> parameters;
     std::vector<ParameterInfo> arguments;
@@ -349,7 +356,7 @@ class OPENMM_EXPORT_COMMON HipNonbondedUtilities : public NonbondedUtilities  {
     std::map<int, double> groupCutoff;
     std::map<int, std::string> groupKernelSource;
     double lastCutoff;
-    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
+    bool useCutoff, usePeriodic, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, canUsePairList, useLargeBlocks;
     int startTileIndex, startBlockIndex, numBlocks, numTilesInBatch, maxExclusions;
     int numForceThreadBlocks, forceThreadBlockSize, findInteractingBlocksThreadBlockSize, numAtoms, groupFlags;
     unsigned int maxTiles, maxSinglePairs, tilesAfterReorder;
@@ -368,6 +375,7 @@ class HipNonbondedUtilities::KernelSet {
     std::string source;
     hipFunction_t forceKernel, energyKernel, forceEnergyKernel;
     hipFunction_t findBlockBoundsKernel;
+    hipFunction_t computeSortKeysKernel;
     hipFunction_t sortBoxDataKernel;
     hipFunction_t findInteractingBlocksKernel;
     hipFunction_t copyInteractionCountsKernel;
diff --git a/platforms/hip/src/HipContext.cpp b/platforms/hip/src/HipContext.cpp
index 771496b..3eca0e3 100644
--- a/platforms/hip/src/HipContext.cpp
+++ b/platforms/hip/src/HipContext.cpp
@@ -40,7 +40,9 @@
 #include "HipNonbondedUtilities.h"
 #include "HipProgram.h"
 #include "HipFFTImplFFT3D.h"
+#ifdef OPENMM_HIP_WITH_HIPFFT
 #include "HipFFTImplHipFFT.h"
+#endif
 #include "HipFFTImplVkFFT.h"
 #include "openmm/common/ComputeArray.h"
 #include "openmm/common/ContextSelector.h"
@@ -489,7 +491,7 @@ string HipContext::getHash(const string& src) const {
 
 string HipContext::getCacheFileName(const string& src) const {
     stringstream cacheFile;
-    cacheFile << cacheDir << getHash(src) << '_' << gpuArchitecture;
+    cacheFile << cacheDir << "openmm-hip-" << getHash(src + gpuArchitecture);
     return cacheFile.str();
 }
 
@@ -500,7 +502,7 @@ hipModule_t HipContext::createModule(const string source) {
 hipModule_t HipContext::createModule(const string source, const map<string, string>& defines) {
     const char* saveTempsEnv = getenv("OPENMM_SAVE_TEMPS");
     bool saveTemps = saveTempsEnv != nullptr;
-    string options = "-ffast-math -munsafe-fp-atomics -Wall";
+    string options = "-O3 -ffast-math -munsafe-fp-atomics -Wall";
     // HIP-TODO: Remove it when the compiler does a better job
     // Disable SLP vectorization as it may generate unoptimal packed math instructions on >=MI200
     // (gfx90a): more v_mov, higher register usage etc.
@@ -594,35 +596,29 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
     else {
         tempFileName << getTempFileName();
     }
-    string inputFile = (tempFileName.str()+".hip.cpp");
-    string outputFile = (tempFileName.str()+".hsaco");
+    string inputFile = (tempFileName.str()+".hip");
+    string outputFile = (tempFileName.str()+".code");
     string logFile = (tempFileName.str()+".log");
     int res = 0;
 
     // If the runtime compiler plugin is available, use it.
 
     if (hasCompilerKernel) {
-        vector<char> ptx = compilerKernel.getAs<HipCompilerKernel>().createModule(src.str(), options, *this);
+        vector<char> code = compilerKernel.getAs<HipCompilerKernel>().createModule(src.str(), options, *this);
         // If possible, write the PTX out to a temporary file so we can cache it for later use.
 
-        bool wroteCache = false;
         try {
-            ofstream out(outputFile.c_str());
-            out.write(&ptx[0], ptx.size());
+            ofstream out(cacheFile.c_str(), ios::out | ios::binary);
+            out.write(&code[0], code.size());
             out.close();
-            if (!out.fail())
-                wroteCache = true;
         }
         catch (...) {
+            // An error occurred.  Possibly we don't have permission to write to the temp directory.
             // Ignore.
         }
-        if (!wroteCache) {
-            // An error occurred.  Possibly we don't have permission to write to the temp directory.  Just try to load the module directly.
-
-            CHECK_RESULT2(hipModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading HIP module");
-            loadedModules.push_back(module);
-            return module;
-        }
+        CHECK_RESULT2(hipModuleLoadDataEx(&module, &code[0], 0, NULL, NULL), "Error loading HIP module");
+        loadedModules.push_back(module);
+        return module;
     }
     else {
         // Write out the source to a temporary file.
@@ -630,48 +626,55 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
         ofstream out(inputFile.c_str());
         out << src.str();
         out.close();
-        string command = compiler + " --genco --offload-arch=" + gpuArchitecture + " " + options + (saveTemps ? " -save-temps=obj" : "") +" -o \""+outputFile+"\" " + " \""+inputFile+"\" 2> \""+logFile+"\"";
+        string command = compiler + " -x hip --offload-device-only --offload-arch=" + gpuArchitecture + " " + options + (saveTemps ? " -save-temps=obj" : "") +" -o \""+outputFile+"\" " + " \""+inputFile+"\" 2> \""+logFile+"\"";
         res = std::system(command.c_str());
-    }
-    try {
-        if (res != 0) {
-            // Load the error log.
-
-            stringstream error;
-            error << "Error launching HIP compiler: " << res;
-            ifstream log(logFile.c_str());
-            if (log.is_open()) {
-                string line;
-                while (!log.eof()) {
-                    getline(log, line);
-                    error << '\n' << line;
+        try {
+            if (res != 0) {
+                // Load the error log.
+
+                stringstream error;
+                error << "Error launching HIP compiler: " << res;
+                ifstream log(logFile.c_str());
+                if (log.is_open()) {
+                    string line;
+                    while (!log.eof()) {
+                        getline(log, line);
+                        error << '\n' << line;
+                    }
+                    log.close();
                 }
-                log.close();
+                throw OpenMMException(error.str());
             }
-            throw OpenMMException(error.str());
-        }
-        hipError_t result = hipModuleLoad(&module, outputFile.c_str());
-        if (result != hipSuccess) {
-            std::stringstream m;
-            m<<"Error loading HIP module: "<<getErrorString(result)<<" ("<<result<<")";
-            throw OpenMMException(m.str());
-        }
-        if (!saveTemps) {
-            remove(inputFile.c_str());
-            remove(logFile.c_str());
+
+            vector<char> code;
+            ifstream out(outputFile.c_str(), ios::in | ios::binary);
+            if (!out.is_open()) {
+                std::stringstream error;
+                error << "Error reading HIP module from `" << outputFile << "`";
+                throw OpenMMException(error.str());
+            }
+            code.insert(code.begin(), istreambuf_iterator<char>(out), istreambuf_iterator<char>());
+            out.close();
+
+            if (!saveTemps) {
+                remove(inputFile.c_str());
+                remove(logFile.c_str());
+            }
+            if (rename(outputFile.c_str(), cacheFile.c_str()) != 0 && !saveTemps)
+                remove(outputFile.c_str());
+
+            CHECK_RESULT2(hipModuleLoadDataEx(&module, &code[0], 0, NULL, NULL), "Error loading HIP module");
+            loadedModules.push_back(module);
+            return module;
         }
-        if (rename(outputFile.c_str(), cacheFile.c_str()) != 0 && !saveTemps)
-            remove(outputFile.c_str());
-        loadedModules.push_back(module);
-        return module;
-    }
-    catch (...) {
-        if (!saveTemps) {
-            remove(inputFile.c_str());
-            remove(outputFile.c_str());
-            remove(logFile.c_str());
+        catch (...) {
+            if (!saveTemps) {
+                remove(inputFile.c_str());
+                remove(outputFile.c_str());
+                remove(logFile.c_str());
+            }
+            throw;
         }
-        throw;
     }
 }
 
@@ -708,7 +711,11 @@ ComputeEvent HipContext::createEvent() {
 
 HipFFTBase* HipContext::createFFT(int xsize, int ysize, int zsize, bool realToComplex, hipStream_t stream, HipArray& in, HipArray& out) {
     if (fftBackend == 1) {
+#ifdef OPENMM_HIP_WITH_HIPFFT
         return new HipFFTImplHipFFT(*this, xsize, ysize, zsize, realToComplex, stream, in, out);
+#else
+        throw OpenMMException("OpenMM HIP is not built with hipFFT support");
+#endif
     }
     else if (fftBackend == 2) {
         return new HipFFTImplVkFFT(*this, xsize, ysize, zsize, realToComplex, stream, in, out);
@@ -720,7 +727,9 @@ HipFFTBase* HipContext::createFFT(int xsize, int ysize, int zsize, bool realToCo
 
 int HipContext::findLegalFFTDimension(int minimum) {
     if (fftBackend == 1) {
+#ifdef OPENMM_HIP_WITH_HIPFFT
         return HipFFTImplHipFFT::findLegalDimension(minimum);
+#endif
     }
     else if (fftBackend == 2) {
         return HipFFTImplVkFFT::findLegalDimension(minimum);
@@ -904,24 +913,20 @@ vector<int> HipContext::getDevicePrecedence() {
     int numDevices;
     hipDeviceProp_t thisDevice;
     string errorMessage = "Error initializing Context";
-    vector<pair<pair<int, int>, int> > devices;
+    vector<pair<int, int> > devices;
 
     CHECK_RESULT(hipGetDeviceCount(&numDevices));
     for (int i = 0; i < numDevices; i++) {
         CHECK_RESULT(hipGetDeviceProperties(&thisDevice, i));
         int clock, multiprocessors, speed;
         // AMD GPU
-        // gcn arch is available if needed, however...
-        int major = thisDevice.gcnArch;
         clock = thisDevice.clockRate;
         multiprocessors = thisDevice.multiProcessorCount;
         speed = clock*multiprocessors;
-        pair<int, int> deviceProperties = std::make_pair(major, speed);
-        devices.push_back(std::make_pair(deviceProperties, -i));
+        devices.push_back(std::make_pair(speed, -i));
     }
 
-    // sort first by compute capability (higher is better), then speed
-    // (higher is better), and finally device index (lower is better)
+    // sort first by speed (higher is better), and finally device index (lower is better)
     std::sort(devices.begin(), devices.end());
     std::reverse(devices.begin(), devices.end());
 
diff --git a/platforms/hip/src/HipFFTImplHipFFT.cpp b/platforms/hip/src/HipFFTImplHipFFT.cpp
index 6adebdf..ecf6f16 100644
--- a/platforms/hip/src/HipFFTImplHipFFT.cpp
+++ b/platforms/hip/src/HipFFTImplHipFFT.cpp
@@ -26,6 +26,8 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
  * -------------------------------------------------------------------------- */
 
+#ifdef OPENMM_HIP_WITH_HIPFFT
+
 #include "HipFFTImplHipFFT.h"
 #include "HipContext.h"
 
@@ -121,3 +123,5 @@ int HipFFTImplHipFFT::findLegalDimension(int minimum) {
         minimum++;
     }
 }
+
+#endif // OPENMM_HIP_WITH_HIPFFT
diff --git a/platforms/hip/src/HipKernelFactory.cpp b/platforms/hip/src/HipKernelFactory.cpp
index 9ef58a8..f75d1fe 100644
--- a/platforms/hip/src/HipKernelFactory.cpp
+++ b/platforms/hip/src/HipKernelFactory.cpp
@@ -6,7 +6,7 @@
  * Biological Structures at Stanford, funded under the NIH Roadmap for        *
  * Medical Research, grant U54 GM072970. See https://simtk.org.               *
  *                                                                            *
- * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2023 Stanford University and the Authors.      *
  * Portions copyright (C) 2020 Advanced Micro Devices, Inc. All Rights        *
  * Reserved.                                                                  *
  * Authors: Peter Eastman, Nicholas Curtis                                    *
@@ -113,6 +113,8 @@ KernelImpl* HipKernelFactory::createKernelImpl(std::string name, const Platform&
         return new CommonCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
     if (name == CalcCustomCVForceKernel::Name())
         return new HipCalcCustomCVForceKernel(name, platform, cu);
+    if (name == CalcCustomCPPForceKernel::Name())
+        return new CommonCalcCustomCPPForceKernel(name, platform, context, cu);
     if (name == CalcRMSDForceKernel::Name())
         return new CommonCalcRMSDForceKernel(name, platform, cu);
     if (name == CalcCustomManyParticleForceKernel::Name())
@@ -141,5 +143,7 @@ KernelImpl* HipKernelFactory::createKernelImpl(std::string name, const Platform&
         return new CommonApplyMonteCarloBarostatKernel(name, platform, cu);
     if (name == RemoveCMMotionKernel::Name())
         return new CommonRemoveCMMotionKernel(name, platform, cu);
+    if (name == CalcATMForceKernel::Name() )
+        return new HipCalcATMForceKernel(name, platform, cu);
     throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
 }
diff --git a/platforms/hip/src/HipKernels.cpp b/platforms/hip/src/HipKernels.cpp
index fc816e4..df2cf69 100644
--- a/platforms/hip/src/HipKernels.cpp
+++ b/platforms/hip/src/HipKernels.cpp
@@ -1001,7 +1001,7 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
     }
     source = cu.replaceStrings(source, replacements);
     if (force.getIncludeDirectSpace())
-        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
+        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), numParticles > 3000, true);
 
     // Initialize the exceptions.
 
diff --git a/platforms/hip/src/HipNonbondedUtilities.cpp b/platforms/hip/src/HipNonbondedUtilities.cpp
index 5dd6fe0..4307c0d 100644
--- a/platforms/hip/src/HipNonbondedUtilities.cpp
+++ b/platforms/hip/src/HipNonbondedUtilities.cpp
@@ -6,7 +6,7 @@
  * Biological Structures at Stanford, funded under the NIH Roadmap for        *
  * Medical Research, grant U54 GM072970. See https://simtk.org.               *
  *                                                                            *
- * Portions copyright (c) 2009-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2023 Stanford University and the Authors.      *
  * Portions copyright (C) 2020-2023 Advanced Micro Devices, Inc. All Rights   *
  * Reserved.                                                                  *
  * Authors: Peter Eastman, Nicholas Curtis                                    *
@@ -51,21 +51,18 @@ using namespace std;
 
 class HipNonbondedUtilities::BlockSortTrait : public HipSort::SortTrait {
 public:
-    BlockSortTrait(bool useDouble) : useDouble(useDouble) {
-    }
-    int getDataSize() const {return useDouble ? sizeof(double2) : sizeof(float2);}
-    int getKeySize() const {return useDouble ? sizeof(double) : sizeof(float);}
-    const char* getDataType() const {return "real2";}
-    const char* getKeyType() const {return "real";}
-    const char* getMinKey() const {return "-3.40282e+38f";}
-    const char* getMaxKey() const {return "3.40282e+38f";}
-    const char* getMaxValue() const {return "make_real2(3.40282e+38f, 3.40282e+38f)";}
-    const char* getSortKey() const {return "value.x";}
-private:
-    bool useDouble;
+    BlockSortTrait() {}
+    int getDataSize() const {return sizeof(int);}
+    int getKeySize() const {return sizeof(int);}
+    const char* getDataType() const {return "unsigned int";}
+    const char* getKeyType() const {return "unsigned int";}
+    const char* getMinKey() const {return "0";}
+    const char* getMaxKey() const {return "0xFFFFFFFFu";}
+    const char* getMaxValue() const {return "0xFFFFFFFFu";}
+    const char* getSortKey() const {return "value";}
 };
 
-HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
+HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true),
         blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) {
     // Decide how many thread blocks to use.
 
@@ -75,6 +72,13 @@ HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(cont
     numForceThreadBlocks = 5*4*context.getMultiprocessors();
     forceThreadBlockSize = 64;
     findInteractingBlocksThreadBlockSize = context.getSIMDWidth();
+
+    // When building the neighbor list, we can optionally use large blocks (32 * warpSize atoms) to
+    // accelerate the process.  This makes building the neighbor list faster, but it prevents
+    // us from sorting atom blocks by size, which leads to a slightly less efficient neighbor
+    // list.  We guess based on system size which will be faster.
+
+    useLargeBlocks = (context.getNumAtoms() > 90000);
     setKernelSource(HipKernelSources::nonbonded);
 }
 
@@ -86,11 +90,11 @@ HipNonbondedUtilities::~HipNonbondedUtilities() {
     hipEventDestroy(downloadCountEvent);
 }
 
-void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
-    addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, false);
+void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList) {
+    addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, usesNeighborList, false);
 }
 
-void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool supportsPairList) {
+void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList, bool supportsPairList) {
     if (groupCutoff.size() > 0) {
         if (usesCutoff != useCutoff)
             throw OpenMMException("All Forces must agree on whether to use a cutoff");
@@ -103,6 +107,7 @@ void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, b
         requestExclusions(exclusionList);
     useCutoff = usesCutoff;
     usePeriodic = usesPeriodic;
+    useNeighborList |= (usesNeighborList && useCutoff);
     groupCutoff[forceGroup] = cutoffDistance;
     groupFlags |= 1<<forceGroup;
     canUsePairList &= supportsPairList;
@@ -217,7 +222,7 @@ void HipNonbondedUtilities::initialize(const System& system) {
     vector<int2> exclusionTilesVec;
     for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
         exclusionTilesVec.push_back(make_int2(iter->first, iter->second));
-    sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 || !useCutoff ? compareInt2 : compareInt2LargeSIMD);
+    sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 || !useNeighborList ? compareInt2 : compareInt2LargeSIMD);
     exclusionTiles.initialize<int2>(context, exclusionTilesVec.size(), "exclusionTiles");
     exclusionTiles.upload(exclusionTilesVec);
     map<pair<int, int>, int> exclusionTileMap;
@@ -291,15 +296,23 @@ void HipNonbondedUtilities::initialize(const System& system) {
         int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
         blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter");
         blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
-        sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
+        sortedBlocks.initialize<unsigned int>(context, numAtomBlocks, "sortedBlocks");
         sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
         sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
+        blockSizeRange.initialize(context, 2, elementSize, "blockSizeRange");
+        largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter");
+        largeBlockBoundingBox.initialize(context, numAtomBlocks*4, elementSize, "largeBlockBoundingBox");
         oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
         rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
-        blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false);
+        blockSorter = new HipSort(context, new BlockSortTrait(), numAtomBlocks, false);
         vector<unsigned int> count(2, 0);
         interactionCount.upload(count);
         rebuildNeighborList.upload(&count[0]);
+        if (context.getUseDoublePrecision()) {
+            blockSizeRange.upload(vector<double>{1e38, 0});
+        } else {
+            blockSizeRange.upload(vector<float>{1e38, 0});
+        }
     }
 
     // Record arguments for kernels.
@@ -343,17 +356,30 @@ void HipNonbondedUtilities::initialize(const System& system) {
         findBlockBoundsArgs.push_back(&blockCenter.getDevicePointer());
         findBlockBoundsArgs.push_back(&blockBoundingBox.getDevicePointer());
         findBlockBoundsArgs.push_back(&rebuildNeighborList.getDevicePointer());
-        findBlockBoundsArgs.push_back(&sortedBlocks.getDevicePointer());
+        findBlockBoundsArgs.push_back(&blockSizeRange.getDevicePointer());
+        computeSortKeysArgs.push_back(&blockBoundingBox.getDevicePointer());
+        computeSortKeysArgs.push_back(&sortedBlocks.getDevicePointer());
+        computeSortKeysArgs.push_back(&blockSizeRange.getDevicePointer());
         sortBoxDataArgs.push_back(&sortedBlocks.getDevicePointer());
         sortBoxDataArgs.push_back(&blockCenter.getDevicePointer());
         sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer());
         sortBoxDataArgs.push_back(&sortedBlockCenter.getDevicePointer());
         sortBoxDataArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
+        if (useLargeBlocks) {
+            sortBoxDataArgs.push_back(&largeBlockCenter.getDevicePointer());
+            sortBoxDataArgs.push_back(&largeBlockBoundingBox.getDevicePointer());
+            sortBoxDataArgs.push_back(context.getPeriodicBoxSizePointer());
+            sortBoxDataArgs.push_back(context.getInvPeriodicBoxSizePointer());
+            sortBoxDataArgs.push_back(context.getPeriodicBoxVecXPointer());
+            sortBoxDataArgs.push_back(context.getPeriodicBoxVecYPointer());
+            sortBoxDataArgs.push_back(context.getPeriodicBoxVecZPointer());
+        }
         sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
         sortBoxDataArgs.push_back(&oldPositions.getDevicePointer());
         sortBoxDataArgs.push_back(&interactionCount.getDevicePointer());
         sortBoxDataArgs.push_back(&rebuildNeighborList.getDevicePointer());
         sortBoxDataArgs.push_back(&forceRebuildNeighborList);
+        sortBoxDataArgs.push_back(&blockSizeRange.getDevicePointer());
         findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
         findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
         findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer());
@@ -371,6 +397,10 @@ void HipNonbondedUtilities::initialize(const System& system) {
         findInteractingBlocksArgs.push_back(&sortedBlocks.getDevicePointer());
         findInteractingBlocksArgs.push_back(&sortedBlockCenter.getDevicePointer());
         findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
+        if (useLargeBlocks) {
+            findInteractingBlocksArgs.push_back(&largeBlockCenter.getDevicePointer());
+            findInteractingBlocksArgs.push_back(&largeBlockBoundingBox.getDevicePointer());
+        }
         findInteractingBlocksArgs.push_back(&exclusionIndices.getDevicePointer());
         findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer());
         findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer());
@@ -398,23 +428,24 @@ void HipNonbondedUtilities::prepareInteractions(int forceGroups) {
         return;
     if (groupKernels.find(forceGroups) == groupKernels.end())
         createKernelsForGroups(forceGroups);
-    if (!useCutoff)
-        return;
-    if (numTiles == 0)
-        return;
     KernelSet& kernels = groupKernels[forceGroups];
-    if (usePeriodic) {
+    if (useCutoff && usePeriodic) {
         double4 box = context.getPeriodicBoxSize();
         double minAllowedSize = 1.999999*kernels.cutoffDistance;
         if (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize)
             throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
     }
+    if (!useNeighborList)
+        return;
+    if (numTiles == 0)
+        return;
 
     // Compute the neighbor list.
 
     if (lastCutoff != kernels.cutoffDistance)
         forceRebuildNeighborList = true;
     context.executeKernelFlat(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getPaddedNumAtoms(), context.getSIMDWidth());
+    context.executeKernelFlat(kernels.computeSortKeysKernel, &computeSortKeysArgs[0], context.getNumAtomBlocks());
     blockSorter->sort(sortedBlocks);
     context.executeKernelFlat(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms(), 64);
     context.executeKernelFlat(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtomBlocks() * context.getSIMDWidth() * numTilesInBatch, findInteractingBlocksThreadBlockSize);
@@ -434,7 +465,7 @@ void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeFor
             kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
         context.executeKernelFlat(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
     }
-    if (useCutoff && numTiles > 0) {
+    if (useNeighborList && numTiles > 0) {
         hipEventSynchronize(downloadCountEvent);
         updateNeighborListSize();
     }
@@ -523,6 +554,8 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) {
             defines["USE_PERIODIC"] = "1";
         if (context.getBoxIsTriclinic())
             defines["TRICLINIC"] = "1";
+        if (useLargeBlocks)
+            defines["USE_LARGE_BLOCKS"] = "1";
         defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
         int maxBits = 0;
         if (canUsePairList) {
@@ -551,8 +584,14 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) {
         defines["MAX_BITS_FOR_PAIRS"] = context.intToString(maxBits);
         defines["NUM_TILES_IN_BATCH"] = context.intToString(numTilesInBatch);
         defines["GROUP_SIZE"] = context.intToString(findInteractingBlocksThreadBlockSize);
+        int binShift = 1;
+        while (1<<binShift <= context.getNumAtomBlocks())
+            binShift++;
+        defines["BIN_SHIFT"] = context.intToString(binShift);
+        defines["BLOCK_INDEX_MASK"] = context.intToString((1<<binShift)-1);
         hipModule_t interactingBlocksProgram = context.createModule(HipKernelSources::vectorOps+HipKernelSources::findInteractingBlocks, defines);
         kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
+        kernels.computeSortKeysKernel = context.getKernel(interactingBlocksProgram, "computeSortKeys");
         kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
         kernels.findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
         kernels.copyInteractionCountsKernel = context.getKernel(interactingBlocksProgram, "copyInteractionCounts");
@@ -671,6 +710,8 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
         defines["USE_EXCLUSIONS"] = "1";
     if (isSymmetric)
         defines["USE_SYMMETRIC"] = "1";
+    if (useNeighborList)
+        defines["USE_NEIGHBOR_LIST"] = "1";
     defines["ENABLE_SHUFFLE"] = "1"; // Used only in hippoNonbonded.cc
     if (includeForces)
         defines["INCLUDE_FORCES"] = "1";
diff --git a/platforms/hip/src/HipPlatform.cpp b/platforms/hip/src/HipPlatform.cpp
index 4ac15a9..d3237df 100644
--- a/platforms/hip/src/HipPlatform.cpp
+++ b/platforms/hip/src/HipPlatform.cpp
@@ -93,6 +93,7 @@ HipPlatform::HipPlatform() {
     registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
     registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
     registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCPPForceKernel::Name(), factory);
     registerKernelFactory(CalcCustomCVForceKernel::Name(), factory);
     registerKernelFactory(CalcRMSDForceKernel::Name(), factory);
     registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
@@ -108,6 +109,7 @@ HipPlatform::HipPlatform() {
     registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
     registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
     registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
+    registerKernelFactory(CalcATMForceKernel::Name(), factory);
     platformProperties.push_back(HipDeviceIndex());
     platformProperties.push_back(HipDeviceName());
     platformProperties.push_back(HipUseBlockingSync());
@@ -133,16 +135,12 @@ HipPlatform::HipPlatform() {
         hipcc = compiler;
     }
     else if (rocmPath != NULL) {
-        hipcc = string(rocmPath) + "/bin/hipcc";
+        hipcc = string(rocmPath) + "/bin/amdclang++";
     }
     else {
-        hipcc = "/opt/rocm/bin/hipcc";
+        hipcc = "/opt/rocm/bin/amdclang++";
     }
     setPropertyDefaultValue(HipCompiler(), hipcc);
-    // Do not use hipRTC by default (it doesn't allow to use a workaround for DPP, see comments in
-    // HipContext::createModule and intrinsics.hip so performance may be a bit lower).
-    // HIP-TODO: Enable when the compiler issue is fixed.
-    // hipRTC on ROCm before 4.3 and earlier has issues so it's not supported
     char* useHipRtcEnv = getenv("OPENMM_USE_HIPRTC");
     bool allowRuntimeCompiler = (useHipRtcEnv != NULL && string(useHipRtcEnv) == "1");
     setPropertyDefaultValue(HipAllowRuntimeCompiler(), allowRuntimeCompiler ? "true" : "false");
diff --git a/platforms/hip/src/HipSort.cpp b/platforms/hip/src/HipSort.cpp
index d7bdbe1..5775fd4 100644
--- a/platforms/hip/src/HipSort.cpp
+++ b/platforms/hip/src/HipSort.cpp
@@ -6,7 +6,7 @@
  * Biological Structures at Stanford, funded under the NIH Roadmap for        *
  * Medical Research, grant U54 GM072970. See https://simtk.org.               *
  *                                                                            *
- * Portions copyright (c) 2010-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2010-2021 Stanford University and the Authors.      *
  * Portions copyright (C) 2020-2023 Advanced Micro Devices, Inc. All Rights   *
  * Reserved.                                                                  *
  * Authors: Peter Eastman, Nicholas Curtis                                    *
@@ -45,11 +45,12 @@ HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, boo
     replacements["MIN_KEY"] = trait->getMinKey();
     replacements["MAX_KEY"] = trait->getMaxKey();
     replacements["MAX_VALUE"] = trait->getMaxValue();
+    replacements["UNIFORM"] = (uniform ? "1" : "0");
     hipModule_t module = context.createModule(context.replaceStrings(HipKernelSources::sort, replacements));
     shortListKernel = context.getKernel(module, "sortShortList");
     shortList2Kernel = context.getKernel(module, "sortShortList2");
     computeRangeKernel = context.getKernel(module, "computeRange");
-    assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
+    assignElementsKernel = context.getKernel(module, uniform ? "assignElementsToBuckets" : "assignElementsToBuckets2");
     computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
     copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
     sortBucketsKernel = context.getKernel(module, "sortBuckets");
@@ -59,7 +60,7 @@ HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, boo
     int maxSharedMem;
     hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice());
     int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
-    int maxShortList = min(3000, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
+    int maxShortList = min(1024, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
     isShortList = (length <= maxShortList);
     sortKernelSize = 256;
     rangeKernelSize = 256;
diff --git a/platforms/hip/src/kernels/findInteractingBlocks.hip b/platforms/hip/src/kernels/findInteractingBlocks.hip
index 65db0f8..4a095ed 100644
--- a/platforms/hip/src/kernels/findInteractingBlocks.hip
+++ b/platforms/hip/src/kernels/findInteractingBlocks.hip
@@ -1,3 +1,5 @@
+#include <hip/hip_fp16.h>
+
 #define BUFFER_SIZE 256
 
 #if defined(AMD_RDNA)
@@ -18,12 +20,27 @@ __device__ inline int warpPopc(warpflags x) {
 
 #endif
 
+struct alignas(sizeof(__half) * 4) BoundingBox {
+    __device__ BoundingBox(real3 f) {
+        // Round up so we'll err on the side of making the box a little too large.
+        // This ensures interactions will never be missed.
+        v[0] = __float2half_ru((float) f.x);
+        v[1] = __float2half_ru((float) f.y);
+        v[2] = __float2half_ru((float) f.z);
+    }
+    __device__ real3 toReal3() const {
+        return make_real3(__half2float(v[0]), __half2float(v[1]), __half2float(v[2]));
+    }
+private:
+    __half v[3];
+};
+
 /**
  * Find a bounding box for the atoms in each block.
  */
 extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
         const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList,
-        real2* __restrict__ sortedBlocks) {
+        real2* __restrict__ blockSizeRange) {
     const int indexInTile = threadIdx.x%TILE_SIZE;
     const int index = warpSize == TILE_SIZE ? blockIdx.x : (blockIdx.x*(warpSize/TILE_SIZE) + threadIdx.x/TILE_SIZE);
     const int base = index * TILE_SIZE;
@@ -97,28 +114,77 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
         center.w = SQRT(tcenter);
         blockBoundingBox[index] = blockSize;
         blockCenter[index] = center;
-        // blockSize.x+blockSize.y+blockSize.z has a distibution that looks like a normal distribution.
-        // This causes HipSort's buckets to have very non-uniform sizes, so a few very long buckets are
-        // sorted in global memory. -1/max(x, y, z) or -1/(x+y+z) have a "faster" distribution.
-        sortedBlocks[index] = make_real2(-RECIP(max(max(blockSize.x, blockSize.y), blockSize.z)), index);
+
+        // Record the range of sizes.
+        real totalSize = blockSize.x+blockSize.y+blockSize.z;
+        atomicMin(&blockSizeRange->x, totalSize);
+        atomicMax(&blockSizeRange->y, totalSize);
     }
     if (blockIdx.x == 0 && threadIdx.x == 0)
         rebuildNeighborList[0] = 0;
 }
 
+extern "C" __global__ void computeSortKeys(const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ sortedBlocks, const real2* __restrict__ blockSizeRange/*, int numSizes*/) {
+    // Sort keys store the bin in the high order part and the block in the low
+    // order part.
+
+    real2 sizeRange = make_real2(LOG(blockSizeRange->x), LOG(blockSizeRange->y));
+    int numSizeBins = 20;
+    real scale = numSizeBins/(sizeRange.y-sizeRange.x);
+    unsigned int i = threadIdx.x+blockIdx.x*blockDim.x;
+    if (i < NUM_BLOCKS) {
+        real4 box = blockBoundingBox[i];
+        real size = LOG(box.x+box.y+box.z);
+        int bin = (size-sizeRange.x)*scale;
+        bin = max(0, min(bin, numSizeBins-1));
+        sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
+    }
+}
 
 /**
  * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
  */
-extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
-        const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
-        real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
-        unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
+extern "C" __global__ void sortBoxData(const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ blockCenter,
+        const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, BoundingBox* __restrict__ sortedBlockBoundingBox,
+#ifdef USE_LARGE_BLOCKS
+        real4* __restrict__ largeBlockCenter, BoundingBox* __restrict__ largeBlockBoundingBox, real4 periodicBoxSize,
+        real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+#endif
+        const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
+        unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild,
+        real2* __restrict__ blockSizeRange) {
     int i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i < NUM_BLOCKS) {
-        int index = (int) sortedBlock[i].y;
+        unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
         sortedBlockCenter[i] = blockCenter[index];
-        sortedBlockBoundingBox[i] = blockBoundingBox[index];
+        sortedBlockBoundingBox[i] = BoundingBox(trimTo3(blockBoundingBox[index]));
+
+#ifdef USE_LARGE_BLOCKS
+        // Compute the sizes of large blocks (composed of warpSize regular blocks) starting from each block.
+
+        real4 minPos = blockCenter[index]-blockBoundingBox[index];
+        real4 maxPos = blockCenter[index]+blockBoundingBox[index];
+        int last = min(i+warpSize, NUM_BLOCKS);
+        for (int j = i+1; j < last; j++) {
+            unsigned int index2 = sortedBlocks[j] & BLOCK_INDEX_MASK;
+            real4 blockPos = blockCenter[index2];
+            real4 width = blockBoundingBox[index2];
+#ifdef USE_PERIODIC
+            real4 center = 0.5f*(maxPos+minPos);
+            APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
+#endif
+            minPos = make_real4(min(minPos.x, blockPos.x-width.x), min(minPos.y, blockPos.y-width.y), min(minPos.z, blockPos.z-width.z), 0);
+            maxPos = make_real4(max(maxPos.x, blockPos.x+width.x), max(maxPos.y, blockPos.y+width.y), max(maxPos.z, blockPos.z+width.z), 0);
+        }
+        largeBlockCenter[i] = 0.5f*(maxPos+minPos);
+        largeBlockBoundingBox[i] = BoundingBox(trimTo3(0.5f*(maxPos-minPos)));
+#endif
+    }
+
+    // Initialize min and max for the atomic reduction in findBlockBounds (for the next step)
+    if (i == 0) {
+        blockSizeRange->x = 1e38;
+        blockSizeRange->y = 0;
     }
 
     // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
@@ -241,8 +307,12 @@ void mfma4x4(const float4& pos1, const float4& pos2, const vfloat& c, unsigned i
 extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
         unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
         int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs,
-        unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
-        const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        unsigned int startBlockIndex, unsigned int numBlocks, const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
+        const BoundingBox* __restrict__ sortedBlockBoundingBox,
+#ifdef USE_LARGE_BLOCKS
+        const real4* __restrict__ largeBlockCenter, const BoundingBox* __restrict__ largeBlockBoundingBox,
+#endif
+        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
         real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
 
     if (rebuildNeighborList[0] == 0)
@@ -273,10 +343,9 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
     if (block1 < startBlockIndex+numBlocks) {
         // Load data for this block.  Note that all threads in a warp are processing the same block.
 
-        real2 sortedKey = sortedBlocks[block1];
-        int x = (int) sortedKey.y;
+        int x = sortedBlocks[block1] & BLOCK_INDEX_MASK;
         real4 blockCenterX = sortedBlockCenter[block1];
-        real4 blockSizeX = sortedBlockBoundingBox[block1];
+        real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3();
         int neighborsInBuffer = 0;
         real4 pos1 = posq[x*TILE_SIZE+indexInTile];
     #ifdef USE_PERIODIC
@@ -311,10 +380,52 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
         // units are idle at the end of the kernel (the kernel works on the upper triangle of
         // the NUM_BLOCKS x NUM_BLOCKS matrix).
 
+#ifdef USE_LARGE_BLOCKS
+        warpflags largeBlockFlags = 0;
+        int loadedLargeBlocks = 0;
+#endif
         int block2Count = 0;
         // Load blocks from addresses aligned by warpSize for faster loading from sortedBlockCenter and sortedBlockBoundingBox.
         for (int block2Base = ((block1+1)/warpSize + warpIndex%NUM_TILES_IN_BATCH)*warpSize; block2Base < NUM_BLOCKS; block2Base += warpSize*NUM_TILES_IN_BATCH) {
+            // Last iteration cannot be skipped (on CDNA where tilesPerWarp == 2)
             const bool lastIteration = block2Base + warpSize*NUM_TILES_IN_BATCH >= NUM_BLOCKS;
+#ifdef USE_LARGE_BLOCKS
+            if (loadedLargeBlocks == 0) {
+                // Check the next set of large blocks.
+
+                int largeBlockIndex = block2Base + warpSize*NUM_TILES_IN_BATCH*indexInWarp;
+                bool includeLargeBlock = false;
+                if (largeBlockIndex < NUM_BLOCKS) {
+                    real4 largeCenter = largeBlockCenter[largeBlockIndex];
+                    real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3();
+                    real4 blockDelta = blockCenterX-largeCenter;
+#ifdef USE_PERIODIC
+                    APPLY_PERIODIC_TO_DELTA(blockDelta)
+#endif
+                    blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
+                    blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
+                    blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
+                    includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
+#ifdef TRICLINIC
+                    // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
+                    // If there's any possibility we might have missed it, do a detailed check.
+
+                    if (periodicBoxSize.z/2-blockSizeX.z-largeSize.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-largeSize.y < PADDED_CUTOFF)
+                        includeLargeBlock = true;
+#endif
+                }
+                largeBlockFlags = __ballot(includeLargeBlock);
+                loadedLargeBlocks = warpSize;
+            }
+            loadedLargeBlocks--;
+            if ((largeBlockFlags&1) == 0 && !lastIteration) {
+                // None of the next warpSize blocks interact with block 1.
+
+                largeBlockFlags >>= 1;
+                continue;
+            }
+            largeBlockFlags >>= 1;
+#endif
             int block2 = block2Base+indexInWarp;
             bool includeBlock2 = (block1 < block2 && block2 < NUM_BLOCKS);
             block2 = includeBlock2 ? block2 : block1;
@@ -329,7 +440,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
             if (!lastIteration && __ballot(includeBlock2) == 0)
                 continue;
     #endif
-            real4 blockSizeY = sortedBlockBoundingBox[block2];
+            real3 blockSizeY = sortedBlockBoundingBox[block2].toReal3();
             blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
             blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
             blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
@@ -361,7 +472,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
                 const int b = block2Buffer[min(block2Index + tileInWarp, block2Count - 1)];
                 const bool forceInclude = b & 1;
                 const int block2 = b >> 1;
-                int y = (int) sortedBlocks[block2].y;
+                int y = sortedBlocks[block2] & BLOCK_INDEX_MASK;
 
                 #pragma unroll 1
                 for (int k = indexInTile; k < numExclusions; k += TILE_SIZE)
diff --git a/platforms/hip/src/kernels/nonbonded.hip b/platforms/hip/src/kernels/nonbonded.hip
index d6e1716..9cbaae0 100644
--- a/platforms/hip/src/kernels/nonbonded.hip
+++ b/platforms/hip/src/kernels/nonbonded.hip
@@ -237,7 +237,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
     // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
     // of them (no cutoff).
 
-#ifdef USE_CUTOFF
+#ifdef USE_NEIGHBOR_LIST
     const unsigned int numTiles = interactionCount[0];
     if (numTiles > maxTiles)
         return; // There wasn't enough memory for the neighbor list.
@@ -262,7 +262,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
         // Extract the coordinates of this tile.
         int x, y;
         bool singlePeriodicCopy = false;
-#ifdef USE_CUTOFF
+#ifdef USE_NEIGHBOR_LIST
         x = tiles[pos];
         real4 blockSizeX = blockSize[x];
         singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
@@ -297,7 +297,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
             // Load atom data for this tile.
             real4 posq1 = posq[atom1];
             LOAD_ATOM1_PARAMETERS
-#ifdef USE_CUTOFF
+#ifdef USE_NEIGHBOR_LIST
             unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
             unsigned int j = y*TILE_SIZE + tgx;
@@ -454,7 +454,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
 
     // Third loop: single pairs that aren't part of a tile.
 
-#if USE_CUTOFF
+#if USE_NEIGHBOR_LIST
     const unsigned int numPairs = interactionCount[1];
     if (numPairs > maxSinglePairs)
         return; // There wasn't enough memory for the neighbor list.
diff --git a/platforms/hip/src/kernels/sort.hip b/platforms/hip/src/kernels/sort.hip
index 288c629..3efe82d 100644
--- a/platforms/hip/src/kernels/sort.hip
+++ b/platforms/hip/src/kernels/sort.hip
@@ -95,6 +95,7 @@ inline __device__ void reduceMinMax(KEY_TYPE minimum, KEY_TYPE maximum, KEY_TYPE
  */
 __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, volatile KEY_TYPE* __restrict__ range,
         unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) {
+#if UNIFORM
     extern __shared__ KEY_TYPE minBuffer[];
     KEY_TYPE* maxBuffer = minBuffer+blockDim.x;
     KEY_TYPE minimum = MAX_KEY;
@@ -131,6 +132,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
         }
         reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[0], &range[1]);
     }
+#endif
 
     // Clear the bucket counters in preparation for the next kernel.
 
@@ -139,7 +141,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
 }
 
 /**
- * Assign elements to buckets.
+ * Assign elements to buckets.  This version is optimized for uniformly distributed data.
  */
 __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
         unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
@@ -154,6 +156,86 @@ __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsi
     }
 }
 
+
+/**
+ * Assign elements to buckets.  This version is optimized for non-uniformly distributed data.
+ */
+__global__ void assignElementsToBuckets2(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
+        unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
+    // Load 64 datapoints and sort them to get an estimate of the data distribution.
+
+    __shared__ KEY_TYPE elements[64];
+    if (threadIdx.x < 64) {
+        int index = (int) (threadIdx.x*length/64.0);
+        elements[threadIdx.x] = getValue(data[index]);
+    }
+    __syncthreads();
+    for (unsigned int k = 2; k <= 64; k *= 2) {
+        for (unsigned int j = k/2; j > 0; j /= 2) {
+            if (threadIdx.x < 64) {
+                int ixj = threadIdx.x^j;
+                if (ixj > threadIdx.x) {
+                    KEY_TYPE value1 = elements[threadIdx.x];
+                    KEY_TYPE value2 = elements[ixj];
+                    bool ascending = (threadIdx.x&k) == 0;
+                    KEY_TYPE lowKey = (ascending ? value1 : value2);
+                    KEY_TYPE highKey = (ascending ? value2 : value1);
+                    if (lowKey > highKey) {
+                        elements[threadIdx.x] = value2;
+                        elements[ixj] = value1;
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // Create a function composed of linear segments mapping data values to bucket indices.
+
+    __shared__ float segmentLowerBound[9];
+    __shared__ float segmentBaseIndex[9];
+    __shared__ float segmentIndexScale[9];
+    if (threadIdx.x == 0) {
+        segmentLowerBound[0] = elements[0]-0.2f*(elements[5]-elements[0]);
+        segmentLowerBound[1] = elements[5];
+        segmentLowerBound[2] = elements[10];
+        segmentLowerBound[3] = elements[20];
+        segmentLowerBound[4] = elements[30];
+        segmentLowerBound[5] = elements[40];
+        segmentLowerBound[6] = elements[50];
+        segmentLowerBound[7] = elements[60];
+        segmentLowerBound[8] = elements[63]+0.2f*(elements[63]-elements[58]);
+        segmentBaseIndex[0] = numBuckets/16;
+        segmentBaseIndex[1] = 3*numBuckets/16;
+        segmentBaseIndex[2] = 5*numBuckets/16;
+        segmentBaseIndex[3] = 7*numBuckets/16;
+        segmentBaseIndex[4] = 9*numBuckets/16;
+        segmentBaseIndex[5] = 11*numBuckets/16;
+        segmentBaseIndex[6] = 13*numBuckets/16;
+        segmentBaseIndex[7] = 15*numBuckets/16;
+        segmentBaseIndex[8] = numBuckets;
+        for (int i = 0; i < 8; i++)
+            if (segmentLowerBound[i+1] == segmentLowerBound[i])
+                segmentIndexScale[i] = 0;
+            else
+                segmentIndexScale[i] = (segmentBaseIndex[i+1]-segmentBaseIndex[i])/(segmentLowerBound[i+1]-segmentLowerBound[i]);
+    }
+    __syncthreads();
+
+    // Assign elements to buckets.
+
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
+        float key = (float) getValue(data[index]);
+        int segment;
+        for (segment = 0; segment < 7 && key > segmentLowerBound[segment+1]; segment++)
+            ;
+        unsigned int bucketIndex = segmentBaseIndex[segment]+(key-segmentLowerBound[segment])*segmentIndexScale[segment];
+        bucketIndex = min(max(0, bucketIndex), numBuckets-1);
+        offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
+        bucketOfElement[index] = bucketIndex;
+    }
+}
+
 /**
  * Sum the bucket sizes to compute the start position of each bucket.  This kernel
  * is executed as a single work group.
diff --git a/platforms/hip/tests/TestHipATMForce.cpp b/platforms/hip/tests/TestHipATMForce.cpp
new file mode 100644
index 0000000..6a594fc
--- /dev/null
+++ b/platforms/hip/tests/TestHipATMForce.cpp
@@ -0,0 +1,36 @@
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2023 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestATMForce.h"
+
+void runPlatformTests() {
+}
diff --git a/platforms/hip/tests/TestHipCustomCPPForce.cpp b/platforms/hip/tests/TestHipCustomCPPForce.cpp
new file mode 100644
index 0000000..7910447
--- /dev/null
+++ b/platforms/hip/tests/TestHipCustomCPPForce.cpp
@@ -0,0 +1,36 @@
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2023 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCustomCPPForce.h"
+
+void runPlatformTests() {
+}
diff --git a/platforms/hip/tests/TestHipFFTImplHipFFT.cpp b/platforms/hip/tests/TestHipFFTImplHipFFT.cpp
index 9753761..f404a3f 100644
--- a/platforms/hip/tests/TestHipFFTImplHipFFT.cpp
+++ b/platforms/hip/tests/TestHipFFTImplHipFFT.cpp
@@ -35,6 +35,8 @@
  * This tests the hipFFT-based implementation of FFT.
  */
 
+#ifdef OPENMM_HIP_WITH_HIPFFT
+
 #include "openmm/internal/AssertionUtilities.h"
 #include "HipArray.h"
 #include "HipContext.h"
@@ -169,3 +171,11 @@ int main(int argc, char* argv[]) {
     cout << "Done" << endl;
     return 0;
 }
+
+#else // OPENMM_HIP_WITH_HIPFFT
+
+int main(int argc, char* argv[]) {
+    return 0;
+}
+
+#endif // OPENMM_HIP_WITH_HIPFFT