pytorch · kirklandsign · Mar 29, 2024
diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt
@@ -66,7 +66,20 @@ if(EXECUTORCH_BUILD_LLAMA_JNI)
   set_property(TARGET custom_ops PROPERTY IMPORTED_LOCATION ${CUSTOM_OPS_PATH})
   target_link_options_shared_lib(custom_ops_lib)
 
   add_library(executorch_llama_jni SHARED jni/jni_layer_llama.cpp)
+  # Threadpool support is optional: it is wired in only when a pthreadpool
+  # target exists. ET_USE_THREADPOOL gates the matching code in jni_layer_llama.cpp.
+  if(TARGET pthreadpool)
+    target_sources(
+      executorch_llama_jni
+      PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/threadpool/cpuinfo_utils.cpp)
+    target_compile_definitions(executorch_llama_jni PRIVATE ET_USE_THREADPOOL=1)
+    # PRIVATE: these headers are only needed to compile this library's own sources.
+    target_include_directories(
+      executorch_llama_jni
+      PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/cpuinfo/include
+              ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/pthreadpool/include)
+  endif()
   target_include_directories(executorch_llama_jni PRIVATE ${_common_include_directories})
   target_link_libraries(executorch_llama_jni ${link_libraries} llama_runner
                         custom_ops custom_ops_lib cpublas eigen_blas)

diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
@@ -20,6 +20,11 @@
 #include <executorch/runtime/platform/platform.h>
 #include <executorch/runtime/platform/runtime.h>
 
+#if defined(ET_USE_THREADPOOL)
+#include <executorch/backends/xnnpack/threadpool/cpuinfo_utils.h>
+#include <executorch/backends/xnnpack/threadpool/threadpool.h>
+#endif
+
 #include <fbjni/ByteBuffer.h>
 #include <fbjni/fbjni.h>
 
@@ -91,6 +96,23 @@ class ExecuTorchLlamaJni
       facebook::jni::alias_ref<jstring> model_path,
       facebook::jni::alias_ref<jstring> tokenizer_path,
       jfloat temperature) {
+#if defined(ET_USE_THREADPOOL)
+    // Reserve 1 thread for the main thread. Compare before subtracting: the
+    // count is held in a uint32_t, so a reported count of 0 minus 1 would wrap
+    // to UINT32_MAX and request an absurd number of worker threads.
+    const uint32_t num_performant_cores =
+        torch::executorch::cpuinfo::get_num_performant_cores();
+    if (num_performant_cores > 1) {
+      const uint32_t num_worker_threads = num_performant_cores - 1;
+      ET_LOG(
+          Info,
+          "Resetting threadpool to %u threads",
+          static_cast<unsigned int>(num_worker_threads));
+      torch::executorch::threadpool::get_threadpool()->_unsafe_reset_threadpool(
+          num_worker_threads);
+    }
+#endif
+
     runner_ = std::make_unique<Runner>(
         model_path->toStdString().c_str(),
         tokenizer_path->toStdString().c_str(),