From 63090d6f1b1810a9e0f0d00ca8573317a7363bbb Mon Sep 17 00:00:00 2001
From: Griffin Bassman <griffinbassman@gmail.com>
Date: Tue, 6 Jun 2023 18:38:31 -0400
Subject: [PATCH] feat: flag to save and load per model state (#4605)

* feat: flag to save and load per model state

* clang

* fix nullptr

* const

* keep

* help

* quotes

* spacing

* check path

* vcpkg ref

* vcpkg full ref

* update checkout action

* conditional

* fix quotes

* show path

* recursive

* show vcpkg after checkout

* remove cache for vcpkg root

* remove caching

* again

* new env var

* redo

* rename vcpkg root
---
 test/train-sets/ref/help.stdout               |  2 +
 test/train-sets/ref/help_cbadf.stdout         |  3 ++
 .../include/vw/core/reductions/cb/cb_adf.h    |  5 ++-
 .../core/include/vw/core/reductions/gd.h      |  1 +
 vowpalwabbit/core/src/reductions/cb/cb_adf.cc | 39 ++++++++++++----
 vowpalwabbit/core/src/reductions/gd.cc        | 44 ++++++++++++++++---
 6 files changed, 78 insertions(+), 16 deletions(-)
diff --git a/test/train-sets/ref/help.stdout b/test/train-sets/ref/help.stdout
index 2ba787ebf0e..b9d4fca2f7b 100644
--- a/test/train-sets/ref/help.stdout
+++ b/test/train-sets/ref/help.stdout
@@ -461,6 +461,7 @@ Weight Options:
                                             (type: float, default: 0, keep)
     --cb_type arg                           Contextual bandit method to use (type: str, default: mtr, choices
                                             {dm, dr, ips, mtr, sm}, keep)
+    --per_model_save_load                   Save and load per model state (type: bool, keep)
 [Reduction] Contextual Bandit: cb -> cb_adf Options:
     --cb_to_cbadf arg                       Flag is unused and has no effect. It should not be passed. The
                                             cb_to_cbadf reduction is automatically enabled if cb, cb_explore
@@ -705,6 +706,7 @@ Weight Options:
                                             default: 0)
     --l2_state arg                          Amount of accumulated implicit l2 regularization (type: float,
                                             default: 1)
+    --per_model_save_load                   Save and load per model state (type: bool, keep)
 [Reduction] Interact via Elementwise Multiplication Options:
     --interact arg                          Put weights on feature products from namespaces <n1> and <n2>
                                             (type: str, keep, necessary)
diff --git a/test/train-sets/ref/help_cbadf.stdout b/test/train-sets/ref/help_cbadf.stdout
index 1c9ba5d15d2..7b047559ef5 100644
--- a/test/train-sets/ref/help_cbadf.stdout
+++ b/test/train-sets/ref/help_cbadf.stdout
@@ -225,6 +225,7 @@ Weight Options:
                                             (type: float, default: 0, keep)
     --cb_type arg                           Contextual bandit method to use (type: str, default: mtr, choices
                                             {dm, dr, ips, mtr, sm}, keep)
+    --per_model_save_load                   Save and load per model state (type: bool, keep)
 [Reduction] Contextual Bandit with Action Dependent Features Options:
     --cb_adf                                Do Contextual Bandit learning with multiline action dependent
                                             features (type: bool, keep, necessary)
@@ -234,6 +235,7 @@ Weight Options:
                                             (type: float, default: 0, keep)
     --cb_type arg                           Contextual bandit method to use (type: str, default: mtr, choices
                                             {dm, dr, ips, mtr, sm}, keep)
+    --per_model_save_load                   Save and load per model state (type: bool, keep)
 [Reduction] Contextual Bandit: cb -> cb_adf Options:
     --cb_to_cbadf arg                       Flag is unused and has no effect. It should not be passed. The
                                             cb_to_cbadf reduction is automatically enabled if cb, cb_explore
@@ -273,6 +275,7 @@ Weight Options:
                                             default: 0)
     --l2_state arg                          Amount of accumulated implicit l2 regularization (type: float,
                                             default: 1)
+    --per_model_save_load                   Save and load per model state (type: bool, keep)
 [Reduction] Scorer Options:
     --link arg                              Specify the link function (type: str, default: identity, choices
                                             {glf1, identity, logistic, poisson}, keep)
diff --git a/vowpalwabbit/core/include/vw/core/reductions/cb/cb_adf.h b/vowpalwabbit/core/include/vw/core/reductions/cb/cb_adf.h
index 0b7abb9aee0..7dc3890af8e 100644
--- a/vowpalwabbit/core/include/vw/core/reductions/cb/cb_adf.h
+++ b/vowpalwabbit/core/include/vw/core/reductions/cb/cb_adf.h
@@ -27,12 +27,13 @@ class cb_adf
   bool update_statistics(const VW::example& ec, const VW::multi_ex& ec_seq, VW::shared_data& sd) const;
 
   cb_adf(VW::cb_type_t cb_type, bool rank_all, float clip_p, bool no_predict, size_t feature_width_above,
-      VW::workspace* all)
+      bool per_model_save_load, VW::workspace* all)
       : _no_predict(no_predict)
       , _rank_all(rank_all)
       , _clip_p(clip_p)
       , _gen_cs_mtr(feature_width_above)
       , _cb_type(cb_type)
+      , _per_model_save_load(per_model_save_load)
       , _all(all)
   {
   }
@@ -57,6 +58,7 @@ class cb_adf
 
   VW::cb_class* known_cost() { return &_gen_cs_dr.known_cost; }
   const VW::cb_class* known_cost() const { return &_gen_cs_dr.known_cost; }
+  bool per_model_save_load() const { return _per_model_save_load; }
 
 private:
   void learn_ips(VW::LEARNER::learner& base, VW::multi_ex& examples);
@@ -84,6 +86,7 @@ class cb_adf
   VW::details::cb_to_cs_adf_dr _gen_cs_dr;
   VW::details::cb_to_cs_adf_mtr _gen_cs_mtr;
   VW::cb_type_t _cb_type;
+  bool _per_model_save_load;
 
   VW::workspace* _all = nullptr;
 };
diff --git a/vowpalwabbit/core/include/vw/core/reductions/gd.h b/vowpalwabbit/core/include/vw/core/reductions/gd.h
index 254cdcbb960..440cb3cc39c 100644
--- a/vowpalwabbit/core/include/vw/core/reductions/gd.h
+++ b/vowpalwabbit/core/include/vw/core/reductions/gd.h
@@ -55,6 +55,7 @@ class gd
   bool adaptive_input = false;
   bool normalized_input = false;
   bool adax = false;
+  bool per_model_save_load = false;
   VW::workspace* all = nullptr;  // parallel, features, parameters
 };
 }  // namespace reductions
diff --git a/vowpalwabbit/core/src/reductions/cb/cb_adf.cc b/vowpalwabbit/core/src/reductions/cb/cb_adf.cc
index 8006c31a7d7..4637daa0157 100644
--- a/vowpalwabbit/core/src/reductions/cb/cb_adf.cc
+++ b/vowpalwabbit/core/src/reductions/cb/cb_adf.cc
@@ -354,13 +354,29 @@ void save_load(VW::reductions::cb_adf& c, VW::io_buf& model_file, bool read, boo
   }
 
   std::stringstream msg;
-  msg << "event_sum " << c.get_gen_cs_mtr().per_model_state[0].event_sum << "\n";
-  VW::details::bin_text_read_write_fixed(model_file, (char*)&c.get_gen_cs_mtr().per_model_state[0].event_sum,
-      sizeof(c.get_gen_cs_mtr().per_model_state[0].event_sum), read, msg, text);
+  if (c.per_model_save_load())  // opt-in: persist every per-model slot, not only slot 0
+  {
+    for (size_t ind = 0; ind < c.get_gen_cs_mtr().per_model_state.size(); ++ind)
+    {  // text labels carry the slot index, kept apart from the value: "<name>_<ind> <value>"
+      msg << "event_sum_" << ind << " " << c.get_gen_cs_mtr().per_model_state[ind].event_sum << "\n";
+      VW::details::bin_text_read_write_fixed(model_file, (char*)&c.get_gen_cs_mtr().per_model_state[ind].event_sum,
+          sizeof(c.get_gen_cs_mtr().per_model_state[ind].event_sum), read, msg, text);
+
+      msg << "action_sum_" << ind << " " << c.get_gen_cs_mtr().per_model_state[ind].action_sum << "\n";
+      VW::details::bin_text_read_write_fixed(model_file, (char*)&c.get_gen_cs_mtr().per_model_state[ind].action_sum,
+          sizeof(c.get_gen_cs_mtr().per_model_state[ind].action_sum), read, msg, text);
+    }
+  }
+  else
+  {
+    msg << "event_sum " << c.get_gen_cs_mtr().per_model_state[0].event_sum << "\n";
+    VW::details::bin_text_read_write_fixed(model_file, (char*)&c.get_gen_cs_mtr().per_model_state[0].event_sum,
+        sizeof(c.get_gen_cs_mtr().per_model_state[0].event_sum), read, msg, text);
 
-  msg << "action_sum " << c.get_gen_cs_mtr().per_model_state[0].action_sum << "\n";
-  VW::details::bin_text_read_write_fixed(model_file, (char*)&c.get_gen_cs_mtr().per_model_state[0].action_sum,
-      sizeof(c.get_gen_cs_mtr().per_model_state[0].action_sum), read, msg, text);
+    msg << "action_sum " << c.get_gen_cs_mtr().per_model_state[0].action_sum << "\n";
+    VW::details::bin_text_read_write_fixed(model_file, (char*)&c.get_gen_cs_mtr().per_model_state[0].action_sum,
+        sizeof(c.get_gen_cs_mtr().per_model_state[0].action_sum), read, msg, text);
+  }
 }
 
 void cb_adf_merge(const std::vector<float>& /* per_model_weights */,
@@ -419,6 +435,7 @@ std::shared_ptr<VW::LEARNER::learner> VW::reductions::cb_adf_setup(VW::setup_bas
   bool rank_all;
   float clip_p;
   bool no_predict = false;
+  bool per_model_save_load = false;
 
   option_group_definition new_options("[Reduction] Contextual Bandit with Action Dependent Features");
   new_options
@@ -436,7 +453,12 @@ std::shared_ptr<VW::LEARNER::learner> VW::reductions::cb_adf_setup(VW::setup_bas
                .keep()
                .default_value("mtr")
                .one_of({"ips", "dm", "dr", "mtr", "sm"})
-               .help("Contextual bandit method to use"));
+               .help("Contextual bandit method to use"))
+      .add(make_option("per_model_save_load", per_model_save_load)
+               .keep()
+               .allow_override()
+               .help("Save and load per model state"));
+  // per_model_save_load is opt-in; when unset, save_load() persists only per_model_state[0].
 
   if (!options.add_parse_and_check_necessary(new_options)) { return nullptr; }
 
@@ -490,7 +512,8 @@ std::shared_ptr<VW::LEARNER::learner> VW::reductions::cb_adf_setup(VW::setup_bas
 
   if (options.was_supplied("baseline") && check_baseline_enabled) { options.insert("check_enabled", ""); }
 
-  auto ld = VW::make_unique<VW::reductions::cb_adf>(cb_type, rank_all, clip_p, no_predict, feature_width_above, &all);
+  auto ld = VW::make_unique<VW::reductions::cb_adf>(
+      cb_type, rank_all, clip_p, no_predict, feature_width_above, per_model_save_load, &all);
 
   auto base = require_multiline(stack_builder.setup_base_learner(feature_width));
 
diff --git a/vowpalwabbit/core/src/reductions/gd.cc b/vowpalwabbit/core/src/reductions/gd.cc
index 4ff434cecd4..80a8673ee98 100644
--- a/vowpalwabbit/core/src/reductions/gd.cc
+++ b/vowpalwabbit/core/src/reductions/gd.cc
@@ -1115,9 +1115,21 @@ void VW::details::save_load_online_state_gd(VW::workspace& all, VW::io_buf& mode
       sizeof(all.update_rule_config.initial_t), read, msg, text);
 
   assert(pms.size() >= 1);
-  msg << "norm normalizer " << pms[0].normalized_sum_norm_x << "\n";
-  VW::details::bin_text_read_write_fixed(model_file, reinterpret_cast<char*>(&pms[0].normalized_sum_norm_x),
-      sizeof(pms[0].normalized_sum_norm_x), read, msg, text);
+  if (g != nullptr && g->per_model_save_load)  // a null g falls through to the single-slot branch
+  {
+    for (size_t ind = 0; ind < pms.size(); ++ind)  // one normalizer record per entry of pms
+    {
+      msg << "norm normalizer_" << ind << " " << pms[ind].normalized_sum_norm_x << "\n";
+      VW::details::bin_text_read_write_fixed(model_file, reinterpret_cast<char*>(&pms[ind].normalized_sum_norm_x),
+          sizeof(pms[ind].normalized_sum_norm_x), read, msg, text);
+    }
+  }
+  else  // default layout: only pms[0] is saved/loaded
+  {
+    msg << "norm normalizer " << pms[0].normalized_sum_norm_x << "\n";
+    VW::details::bin_text_read_write_fixed(model_file, reinterpret_cast<char*>(&pms[0].normalized_sum_norm_x),
+        sizeof(pms[0].normalized_sum_norm_x), read, msg, text);
+  }
 
   msg << "t " << all.sd->t << "\n";
   VW::details::bin_text_read_write_fixed(
@@ -1174,9 +1186,21 @@ void VW::details::save_load_online_state_gd(VW::workspace& all, VW::io_buf& mode
     // restore some data to allow save_resume work more accurate
 
     // fix average loss
-    msg << "total_weight " << pms[0].total_weight << "\n";
-    VW::details::bin_text_read_write_fixed(
-        model_file, reinterpret_cast<char*>(&pms[0].total_weight), sizeof(pms[0].total_weight), read, msg, text);
+    if (g != nullptr && g->per_model_save_load)  // a null g falls through to the single-slot branch
+    {
+      for (size_t ind = 0; ind < pms.size(); ++ind)  // one total_weight record per entry of pms
+      {
+        msg << "total_weight_" << ind << " " << pms[ind].total_weight << "\n";
+        VW::details::bin_text_read_write_fixed(model_file, reinterpret_cast<char*>(&pms[ind].total_weight),
+            sizeof(pms[ind].total_weight), read, msg, text);
+      }
+    }
+    else  // default layout: only pms[0] is saved/loaded
+    {
+      msg << "total_weight " << pms[0].total_weight << "\n";
+      VW::details::bin_text_read_write_fixed(
+          model_file, reinterpret_cast<char*>(&pms[0].total_weight), sizeof(pms[0].total_weight), read, msg, text);
+    }
 
     // fix "loss since last" for first printed out example details
     msg << "sd::oec.weighted_labeled_examples " << all.sd->old_weighted_labeled_examples << "\n";
@@ -1435,6 +1459,7 @@ std::shared_ptr<VW::LEARNER::learner> VW::reductions::gd_setup(VW::setup_base_i&
   all.sd->contraction = L2_STATE_DEFAULT;
   float local_gravity = 0;
   float local_contraction = 0;
+  bool per_model_save_load = false;
 
   option_group_definition new_options("[Reduction] Gradient Descent");
   new_options
@@ -1461,7 +1486,11 @@ std::shared_ptr<VW::LEARNER::learner> VW::reductions::gd_setup(VW::setup_base_i&
       .add(make_option("l2_state", local_contraction)
                .allow_override()
                .default_value(L2_STATE_DEFAULT)
-               .help("Amount of accumulated implicit l2 regularization"));
+               .help("Amount of accumulated implicit l2 regularization"))
+      .add(make_option("per_model_save_load", per_model_save_load)
+               .keep()
+               .allow_override()
+               .help("Save and load per model state"));
   options.add_and_parse(new_options);
 
   if (options.was_supplied("l1_state")) { all.sd->gravity = local_gravity; }
@@ -1475,6 +1504,7 @@ std::shared_ptr<VW::LEARNER::learner> VW::reductions::gd_setup(VW::setup_base_i&
   g->neg_norm_power = (all.weights.adaptive ? (all.update_rule_config.power_t - 1.f) : -1.f);
   g->neg_power_t = -all.update_rule_config.power_t;
   g->sparse_l2 = sparse_l2;
+  g->per_model_save_load = per_model_save_load;
 
   if (all.update_rule_config.initial_t >
       0)  // for the normalized update: if initial_t is bigger than 1 we interpret this as if we had