From 256b78faa78f447ba9b4c4623e5ec1fac4d6a121 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Sun, 6 Oct 2024 17:52:40 -0700
Subject: [PATCH] Fix typos and formatting errors (#18)

Co-authored-by: Nathan Lambert
---
 chapters/01-introduction.md   | 15 ++++++++++-----
 chapters/07-reward-models.md  | 17 ++++++++++++++++-
 chapters/08-regularization.md | 12 ++++++------
 chapters/bib.bib              | 27 +++++++++++++++++++++++++++
 4 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/chapters/01-introduction.md b/chapters/01-introduction.md
index ca09364..3eccde3 100644
--- a/chapters/01-introduction.md
+++ b/chapters/01-introduction.md
@@ -23,30 +23,35 @@
 It will not cover all the history of the components nor recent research methods,
 
 This book has the following chapters following this Introduction:
 
-**Introductions**
+**Introductions**:
+
 1. Introduction
 2. What are preferences?: The philosophy and social sciences behind RLHF.
 3. Optimization and RL: The problem formulation of RLHF.
 4. Seminal (Recent) Works: The core works leading to and following ChatGPT.
 
-**Problem Setup**
+**Problem Setup**:
+
 1. Definitions: Mathematical reference.
 2. Preference Data: Gathering human data of preferences.
 3. Reward Modeling: Modeling human preferences for environment signal.
 4. Regularization: Numerical tricks to stabilize and guide optimization.
 
-**Optimization**
+**Optimization**:
+
 1. Instruction Tuning: Fine-tuning models to follow instructions.
 2. Rejection Sampling: Basic method for using a reward model to filter data.
 3. Policy Gradients: Core RL methods used to perform RLHF.
 4. Direct Alignment Algorithms: New PreFT algorithms that do not need RL.
 
-**Advanced (TBD)**
+**Advanced (TBD)**:
+
 1. Constitutional AI
 2. Synthetic Data
 3. Evaluation
 
-**Open Questions (TBD)**
+**Open Questions (TBD)**:
+
 1. Over-optimization
 2. Style
diff --git a/chapters/07-reward-models.md b/chapters/07-reward-models.md
index d28a5a2..a8bebb9 100644
--- a/chapters/07-reward-models.md
+++ b/chapters/07-reward-models.md
@@ -26,4 +26,19 @@ rewards_chosen = model(**inputs_chosen)
 rewards_rejected = model(**inputs_rejected)
 
 loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean()
-```
\ No newline at end of file
+```
+
+### Further Reading
+
+A recommended reading list for reward modeling:
+
+- RewardBench (biased, but gives a good overview): https://arxiv.org/abs/2403.13787
+- ArmoRM: https://arxiv.org/abs/2406.12845
+- HelpSteer2: https://arxiv.org/html/2406.08673v1
+- HelpSteer2-Preference: https://arxiv.org/abs/2410.01257
+- Nemotron-4 340B: https://arxiv.org/abs/2406.11704
+- Llama 2: https://arxiv.org/abs/2307.09288
+- Interconnects 1: https://www.interconnects.ai/p/why-reward-models-matter
+- Interconnects 2: https://www.interconnects.ai/p/open-rlhf-reward-models
+- The original reward modeling paper: https://arxiv.org/abs/1811.07871
+- Critique-out-loud reward models: https://arxiv.org/abs/2408.11791
\ No newline at end of file
diff --git a/chapters/08-regularization.md b/chapters/08-regularization.md
index a8527df..d70e423 100644
--- a/chapters/08-regularization.md
+++ b/chapters/08-regularization.md
@@ -11,7 +11,7 @@
 Still, it is important to understand tools to constrain optimization in RLHF.
 
 The general formulation, when used in an RLHF framework with a reward model, $r_\theta$ is as follows:
 
-$$ r = r_\theta - \lambda r_{\text{reg.}} $$ {eq:rl_start}
+$$ r = r_\theta - \lambda r_{\text{reg.}} $$ {#eq:rl_start}
 
 With the reference implementation being:
@@ -61,11 +61,11 @@ ref_logprobs = convert_to_logpbs(ref_logits)
 kl_approx = logprob - ref_logprob
 kl_full = F.kl_div(ref_logprob, logprob) # alternate computation
 ```
-Some example implementations include [TRL](https://github.com/huggingface/trl/blob/5c21de30ae210e4251ead85517ba8dfe3f210e81/trl/trainer/ppo_trainer.py#L1150) and [Hamish Ivison's Jax Code]https://github.com/hamishivi/EasyLM/blob/main/EasyLM/models/llama/llama_train_ppo.py#L278)
+Some example implementations include [TRL](https://github.com/huggingface/trl/blob/5c21de30ae210e4251ead85517ba8dfe3f210e81/trl/trainer/ppo_trainer.py#L1150) and [Hamish Ivison's Jax Code](https://github.com/hamishivi/EasyLM/blob/main/EasyLM/models/llama/llama_train_ppo.py#L278)
 
 ## Pretraining Gradients
 
-Another way of viewing regularization is that you may have a *dataset* that you want the model to remain close to, as done in InstructGPT [@ouyang2022training] ``in order to fix the
+Another way of viewing regularization is that you may have a *dataset* that you want the model to remain close to, as done in InstructGPT [@ouyang2022training] ''in order to fix the
 performance regressions on public NLP datasets''.
 To implement this, they modify the training objective for RLHF.
 Taking @eq:rl_start, we can transform this into an objective function to optimize by sampling from the RL policy model, completions $y$ from prompts $x$, which yields:
@@ -77,7 +77,7 @@
 $$
 \text{objective} (\theta) = \mathbb{E}_{(x,y) \sim \mathcal{D}_{\pi^{\text{RL}}_{\theta}}} \left[ r_{\theta}(x, y) - \lambda r_{\text{reg.}} \right] + \gamma \mathbb{E}_{x \sim \mathcal{D}_{\text{pretrain}}} \left[ \log(\pi^{\text{RL}}_{\theta}(x)) \right]
 $$
 
-[@pang2024iterative] proposed using using a negative log likelihood term to balance the optimization of Direct Preference Optimization (DPO).
+Recent work proposed using a negative log likelihood term to balance the optimization of Direct Preference Optimization (DPO) [@pang2024iterative].
 Given the pairwise nature of the DPO loss, the same loss modification can be made to reward model training, constraining the model to predict accurate text (rumors from laboratories that did not publish the work).
 The optimization follows as a modification to DPO.
 
 $$
 \mathcal{L}_{\text{DPO+NLL}} = \mathcal{L}_{\text{DPO}}(c_i^w, y_i^w, c_i^l, y_i^l \mid x_i) + \alpha \mathcal{L}_{\text{NLL}}(c_i^w, y_i^w \mid x_i)
 $$
 
 TODO: Make the above equations congruent with the rest of the notation on DPO.
 
@@ -94,7 +94,7 @@
 Controlling the optimization is less well defined in other parts of the RLHF stack.
 Most reward models have no regularization beyond the standard contrastive loss function.
-Direct Alignment Algorithms handle regulaization to KL distances differently, through the $\Beta$ parameter (see the chapter on Direct Alignment).
+Direct Alignment Algorithms handle regularization of KL distances differently, through the $\beta$ parameter (see the chapter on Direct Alignment).
 
 Llama 2 proposed a margin loss for reward model training [@touvron2023llama]:
 
 $$
 \mathcal{L}(\theta) = - \log \left( \sigma \left( r_{\theta}(x, y_c) - r_{\theta}(x, y_r) - m(r) \right) \right)
 $$
 
 Where $m(r)$ is the numerical difference in delta between the ratings of two annotators.
 This is either achieved by having annotators rate the outputs on a numerical scale or by using a quantified ranking method, such as [Likert scales](https://en.wikipedia.org/wiki/Likert_scale).
 
-Reward margins have been used heavily in the direct alignment literature, such as Reward weighted DPO, ``Reward-aware Preference Optimization'' (RPO), which integrates reward model scores into the update rule following a DPO loss [@adler2024nemotron], or REBEL [@gao2024rebel] that has a reward delta weighting in a regression-loss formulation.
+Reward margins have been used heavily in the direct alignment literature, such as reward-weighted DPO, "Reward-aware Preference Optimization" (RPO), which integrates reward model scores into the update rule following a DPO loss [@adler2024nemotron], or REBEL [@gao2024rebel], which uses a reward-delta weighting in a regression-loss formulation.
diff --git a/chapters/bib.bib b/chapters/bib.bib
index 783bc34..fbcac31 100644
--- a/chapters/bib.bib
+++ b/chapters/bib.bib
@@ -65,6 +65,13 @@ @article{stiennon2020learning
   year={2020}
 }
 
+@article{askell2021general,
+  title={A general language assistant as a laboratory for alignment},
+  author={Askell, Amanda and Bai, Yuntao and Chen, Anna and Drain, Dawn and Ganguli, Deep and Henighan, Tom and Jones, Andy and Joseph, Nicholas and Mann, Ben and DasSarma, Nova and others},
+  journal={arXiv preprint arXiv:2112.00861},
+  year={2021}
+}
+
 @article{nakano2021webgpt,
   title={Webgpt: Browser-assisted question-answering with human feedback},
   author={Nakano, Reiichiro and Hilton, Jacob and Balaji, Suchir and Wu, Jeff and Ouyang, Long and Kim, Christina and Hesse, Christopher and Jain, Shantanu and Kosaraju, Vineet and Saunders, William and others},
@@ -101,7 +108,27 @@ @article{touvron2023llama
   year={2023}
 }
 
+@article{adler2024nemotron,
+  title={Nemotron-4 340B Technical Report},
+  author={Adler, Bo and Agarwal, Niket and Aithal, Ashwath and Anh, Dong H and Bhattacharya, Pallab and Brundyn, Annika and Casper, Jared and Catanzaro, Bryan and Clay, Sharon and Cohen, Jonathan and others},
+  journal={arXiv preprint arXiv:2406.11704},
+  year={2024}
+}
+
 # RLHF More ########################################################################
+@article{pang2024iterative,
+  title={Iterative reasoning preference optimization},
+  author={Pang, Richard Yuanzhe and Yuan, Weizhe and Cho, Kyunghyun and He, He and Sukhbaatar, Sainbayar and Weston, Jason},
+  journal={arXiv preprint arXiv:2404.19733},
+  year={2024}
+}
+
+@article{gao2024rebel,
+  title={Rebel: Reinforcement learning via regressing relative rewards},
+  author={Gao, Zhaolin and Chang, Jonathan D and Zhan, Wenhao and Oertell, Owen and Swamy, Gokul and Brantley, Kiant{\'e} and Joachims, Thorsten and Bagnell, J Andrew and Lee, Jason D and Sun, Wen},
+  journal={arXiv preprint arXiv:2404.16767},
+  year={2024}
+}
 
 # LLM as a Judge ####################################################################
 @article{zheng2023judging,
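For readers who want to connect the margin loss touched by this patch in `08-regularization.md` to the pairwise loss shown in `07-reward-models.md`, here is a minimal PyTorch sketch. It is not part of the patch or the book: the function name, tensor shapes, and margin values are illustrative assumptions, and a zero margin recovers the standard contrastive loss.

```python
# Illustrative sketch only: a Llama 2-style margin loss for a pairwise reward model,
# -log(sigmoid(r_chosen - r_rejected - m(r))).
import torch
import torch.nn.functional as F

def pairwise_margin_loss(rewards_chosen: torch.Tensor,
                         rewards_rejected: torch.Tensor,
                         margin: torch.Tensor) -> torch.Tensor:
    # `margin` is a hypothetical per-pair tensor derived from annotator ratings,
    # e.g. the delta between Likert scores for the chosen and rejected responses.
    return -F.logsigmoid(rewards_chosen - rewards_rejected - margin).mean()

# Example with dummy scalar rewards for a batch of three comparisons.
rewards_chosen = torch.tensor([1.2, 0.3, 2.0])
rewards_rejected = torch.tensor([0.4, 0.1, 1.5])
margin = torch.tensor([1.0, 0.0, 2.0])  # larger when annotators agreed more strongly
loss = pairwise_margin_loss(rewards_chosen, rewards_rejected, margin)
print(loss)
```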