Halve model loading time for llama demo on iOS (#4032)
Summary:
Pull Request resolved: #4032

mmap is not well suited to large sequential workloads: touching each mapped page for the first time incurs a page fault. I originally assumed that switching to a plain file read would hurt peak memory usage (we read all the weights into memory at once and then pack them; packing is basically copying them), but it doesn't. In retrospect this makes sense: we actually operate on one weights tensor at a time, and the individual tensors aren't gigantic; there are just a lot of them.

Reviewed By: larryliu0820

Differential Revision: D58826044

fbshipit-source-id: 1a4e8a3522d5e24b56687bebd1f0f92e5514aff2
swolchok authored and facebook-github-bot committed Jul 2, 2024
1 parent 01a1b28 commit 8740c69
Showing 3 changed files with 16 additions and 4 deletions.
16 changes: 12 additions & 4 deletions examples/models/llama2/runner/runner.cpp
@@ -15,6 +15,7 @@
 #else /* BPE */
 #include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
 #endif /* ET_USE_TIKTOKEN*/
+#include <executorch/extension/data_loader/file_data_loader.h>
 #include <executorch/extension/evalue_util/print_evalue.h>
 #include <executorch/extension/runner_util/managed_tensor.h>

@@ -42,9 +43,7 @@ Runner::Runner(
     const std::string& model_path,
     const std::string& tokenizer_path,
     const float temperature)
-    : module_(std::make_unique<Module>(
-          model_path,
-          Module::MlockConfig::UseMlockIgnoreErrors)),
+    : model_path_(model_path),
       tokenizer_path_(tokenizer_path),
       temperature_(temperature) {
   ET_LOG(
@@ -55,13 +54,22 @@ Runner::Runner(
 }
 
 bool Runner::is_loaded() const {
-  return module_->is_loaded() && tokenizer_ && sampler_;
+  return module_ && module_->is_loaded() && tokenizer_ && sampler_;
 }
 
 Error Runner::load() {
   if (is_loaded()) {
     return Error::Ok;
   }
+  // NOTE: we observed ~2x loading performance increase on iPhone 15
+  // and a ~5% improvement on Galaxy S22 by switching to
+  // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
+  auto data_loader_result = util::FileDataLoader::from(model_path_.c_str());
+  if (!data_loader_result.ok()) {
+    return data_loader_result.error();
+  }
+  module_ = std::make_unique<Module>(
+      std::make_unique<util::FileDataLoader>(std::move(*data_loader_result)));
   ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward"));
 
   // Read out metadata: vocab_size (expected by the model), BOS, EOS, n_BOS,

// Read out metadata: vocab_size (expected by the model), BOS, EOS, n_BOS,
1 change: 1 addition & 0 deletions examples/models/llama2/runner/runner.h
@@ -92,6 +92,7 @@ class Runner {
   bool use_sdpa_with_kv_cache_;
   bool append_eos_;
   std::unordered_set<std::string> model_methods_;
+  std::string model_path_;
   std::unique_ptr<Module> module_;
   std::string tokenizer_path_;
   float temperature_;
3 changes: 3 additions & 0 deletions examples/models/llama2/runner/targets.bzl
@@ -31,6 +31,9 @@ def define_common_targets():
         visibility = [
             "@EXECUTORCH_CLIENTS",
         ],
+        deps = [
+            "//executorch/extension/data_loader:file_data_loader",
+        ],
         exported_deps = [
             "//executorch/backends/xnnpack:xnnpack_backend",
             "//executorch/examples/models/llama2/sampler:sampler" + aten_suffix,
