From cfe91f47ca95cbe1f0f1b19ea32bb7646b98965e Mon Sep 17 00:00:00 2001
From: Songhao Jia
Date: Wed, 24 Apr 2024 02:51:50 -0700
Subject: [PATCH] add dynamic export into llm manual (#3202)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3202

This diff adds dynamic export into the LLM Manual, including code and related
comments. It also updates other documentation for better understanding.

Reviewed By: dbort

Differential Revision: D56365041

fbshipit-source-id: 5ce4c15206a2923c4d54811cefca03f72869c719
(cherry picked from commit 66a350b1bbf7a199b33778be17fef6847ffc37c0)
---
 docs/source/llm/getting-started.md | 106 ++++++++++++++++++++++++-----
 1 file changed, 89 insertions(+), 17 deletions(-)

diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md
index 3bff5c903f..2404b07790 100644
--- a/docs/source/llm/getting-started.md
+++ b/docs/source/llm/getting-started.md
@@ -1,5 +1,18 @@
 # Getting Started with LLMs via ExecuTorch

+Welcome to the LLM Manual! This manual provides a practical example of how to leverage
+ExecuTorch when onboarding your own Large Language Models (LLMs). Our primary goal is to offer
+a clear and concise guide on how to integrate our system with your own LLMs.
+
+Please note that this project is intended as a demonstration and not as a fully functional
+example with optimal performance. As such, certain components, such as the sampler, tokenizer,
+and others, are provided as bare-minimum versions solely for demonstration purposes.
+Consequently, the results produced by the model may vary and might not always be optimal.
+
+We encourage you to use this project as a starting point and adapt it to your specific needs,
+which may include creating your own versions of the tokenizer, sampler, acceleration backends,
+and other components. We hope this project serves as a useful guide in your journey with LLMs and ExecuTorch.
+

 ### Table Of Contents

@@ -141,13 +154,24 @@ model = GPT.from_pretrained('gpt2')

 # Create example inputs. This is used in the export process to provide
 # hints on the expected shape of the model input.
-example_inputs = (torch.randint(0, 100, (1, 8), dtype=torch.long), )
+example_inputs = (torch.randint(0, 100, (1, model.config.block_size), dtype=torch.long), )
+
+# Set up dynamic shape configuration. This allows the sizes of the input tensors
+# to differ from the sizes of the tensors in `example_inputs` during runtime, as
+# long as they adhere to the rules specified in the dynamic shape configuration.
+# Here we set the range of the 0th model input's 1st dimension to
+# [0, model.config.block_size].
+# See https://pytorch.org/executorch/main/concepts.html#dynamic-shapes
+# for details about creating dynamic shapes.
+dynamic_shape = (
+    {1: torch.export.Dim("token_dim", max=model.config.block_size)},
+)

 # Trace the model, converting it to a portable intermediate representation.
 # The torch.no_grad() call tells PyTorch to exclude training-specific logic.
 with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
-    m = capture_pre_autograd_graph(model, example_inputs)
-    traced_model = export(m, example_inputs)
+    m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape)
+    traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape)

 # Convert the model into a runnable ExecuTorch program.
 edge_config = EdgeCompileConfig(_check_ir_validity=False)
@@ -204,11 +228,15 @@ output token by token. Each generated token is passed as input for the next run.
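Before the C++ implementation below, it may help to see this feedback loop in isolation. The following is a minimal, runnable Python sketch of the same idea; it is illustrative only and is not part of the tutorial code. The `run_model` stub stands in for the real ExecuTorch module call and simply returns random logits.

```python
import torch

VOCAB_SIZE = 50257       # GPT-2 vocabulary size
ENDOFTEXT_TOKEN = 50256  # GPT-2 <|endoftext|> token id


def run_model(tokens: list[int]) -> torch.Tensor:
    # Stand-in for the exported model: returns random logits with shape
    # (batch, sequence_length, vocab_size). The real runner calls the
    # ExecuTorch Module here instead.
    return torch.randn(1, len(tokens), VOCAB_SIZE)


def generate(prompt_tokens: list[int], max_input_length: int, max_output_length: int) -> list[int]:
    input_tokens = list(prompt_tokens)
    output_tokens: list[int] = []
    for _ in range(max_output_length):
        logits = run_model(input_tokens)
        # Sample from the logits of the last position; greedy argmax here.
        next_token = int(torch.argmax(logits[0, -1]))
        if next_token == ENDOFTEXT_TOKEN:
            break
        output_tokens.append(next_token)
        # Feed the new token back in, trimming the window if it grows too long.
        input_tokens.append(next_token)
        if len(input_tokens) > max_input_length:
            input_tokens.pop(0)
    return output_tokens


# Arbitrary example token ids.
print(generate([1, 2, 3], max_input_length=1024, max_output_length=5))
```

The C++ `generate()` below follows the same structure, with a real tokenizer, sampler, and ExecuTorch `Module` in place of the stubs.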
 ```cpp
 // main.cpp

+// The value of the gpt2 `<|endoftext|>` token.
+#define ENDOFTEXT_TOKEN 50256
+
 std::string generate(
     Module& llm_model,
     std::string& prompt,
     BasicTokenizer& tokenizer,
     BasicSampler& sampler,
+    size_t max_input_length,
     size_t max_output_length) {

   // Convert the input text into a list of integers (tokens) that represents
@@ -237,14 +265,23 @@ std::string generate(

     // Sample the next token from the logits.
     int64_t next_token = sampler.sample(logits);
+
+    // Break if we reached the end of the text.
+    if (next_token == ENDOFTEXT_TOKEN) {
+      break;
+    }
+
+    // Add the next token to the output.
     output_tokens.push_back(next_token);

     std::cout << tokenizer.decode({ next_token });
     std::cout.flush();

     // Update next input.
-    input_tokens.erase(input_tokens.begin());
     input_tokens.push_back(next_token);
+    if (input_tokens.size() > max_input_length) {
+      input_tokens.erase(input_tokens.begin());
+    }
   }

   std::cout << std::endl;
@@ -278,7 +315,9 @@ penalties for repeated tokens, and biases to prioritize or de-prioritize specifi

 int main() {
   // Set up the prompt. This provides the seed text for the model to elaborate.
-  std::string prompt = "Once upon a time, there was a";
+  std::cout << "Enter model prompt: ";
+  std::string prompt;
+  std::getline(std::cin, prompt);

   // The tokenizer is used to convert between tokens (used by the model) and
   // human-readable strings.
@@ -290,19 +329,19 @@ int main() {
   // Load the exported nanoGPT program, which was generated via the previous steps.
   Module model("nanogpt.pte", torch::executor::Module::MlockConfig::UseMlockIgnoreErrors);

+  const auto max_input_tokens = 1024;
   const auto max_output_tokens = 30;
   std::cout << prompt;
-  generate(model, prompt, tokenizer, sampler, max_output_tokens);
+  generate(model, prompt, tokenizer, sampler, max_input_tokens, max_output_tokens);
 }
 ```

 Finally, download the following files into the same directory as main.cpp:

-TODO: This is a placeholder.
 ```
-curl -O https://raw.githubusercontent.com/GregoryComer/et-tutorials/quantization/nanogpt/managed_tensor.h
-curl -O https://raw.githubusercontent.com/GregoryComer/et-tutorials/quantization/nanogpt/basic_tokenizer.h
-curl -O https://raw.githubusercontent.com/GregoryComer/et-tutorials/quantization/nanogpt/basic_sampler.h
+curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_sampler.h
+curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_tokenizer.h
+curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/managed_tensor.h
 ```

 To learn more, see [Running an ExecuTorch Model in C++](https://pytorch.org/executorch/main/running-a-model-cpp-tutorial.html)
@@ -363,10 +402,20 @@ cmake --build cmake-out -j10
 ./cmake-out/nanogpt_runner
 ```

-You should see something like the following:
+You should see the message:
+
+```
+Enter model prompt:
+```
+
+Type some seed text for the model and press Enter. Here we use "Hello world!" as
+an example prompt:

 ```
-Once upon a time, there was a man who was a member of the military...
+Enter model prompt: Hello world!
+Hello world!
+
+I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a very popular game in
 ```

 At this point, it is likely to run very slowly. This is because ExecuTorch hasn't been told to optimize for
@@ -423,14 +472,25 @@ model = GPT.from_pretrained('gpt2')
 # Create example inputs. This is used in the export process to provide
 # hints on the expected shape of the model input.
 example_inputs = (
-    torch.randint(0, 100, (1, 8), dtype=torch.long),
+    torch.randint(0, 100, (1, model.config.block_size - 1), dtype=torch.long),
 )

+# Set up dynamic shape configuration. This allows the sizes of the input tensors
+# to differ from the sizes of the tensors in `example_inputs` during runtime, as
+# long as they adhere to the rules specified in the dynamic shape configuration.
+# Here we set the range of the 0th model input's 1st dimension to
+# [0, model.config.block_size - 1].
+# See https://pytorch.org/executorch/main/concepts.html#dynamic-shapes
+# for details about creating dynamic shapes.
+dynamic_shape = (
+    {1: torch.export.Dim("token_dim", max=model.config.block_size - 1)},
+)
+
 # Trace the model, converting it to a portable intermediate representation.
 # The torch.no_grad() call tells PyTorch to exclude training-specific logic.
 with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
-    m = capture_pre_autograd_graph(model, example_inputs)
-    traced_model = export(m, example_inputs)
+    m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape)
+    traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape)

 # Convert the model into a runnable ExecuTorch program.
 # To be further lowered to the XNNPACK backend, `traced_model` needs an XNNPACK-specific edge compile config
@@ -512,12 +572,24 @@ cmake --build cmake-out -j10
 ./cmake-out/nanogpt_runner
 ```

-You should see something like the following:
+
+You should see the message:
+
+```
+Enter model prompt:
+```
+
+Type some seed text for the model and press Enter. Here we use "Hello world!" as
+an example prompt:

 ```
-Once upon a time, there was a man who was a member of the military...
+Enter model prompt: Hello world!
+Hello world!
+
+I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a very popular game in
 ```

+The delegated model should be noticeably faster compared to the non-delegated model.

 For more information regarding backend delegation, see the ExecuTorch guides for the
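As a consolidated reference, the XNNPACK-delegated export that the hunks above modify piecemeal can be sketched end to end roughly as follows. This is a sketch, not the authoritative tutorial script: the import paths (for example `torch._export.capture_pre_autograd_graph`, `executorch.backends.xnnpack.utils.configs.get_xnnpack_edge_compile_config`, and the nanoGPT `from model import GPT`) are assumptions based on the ExecuTorch examples and may differ between versions.

```python
import torch
from torch._export import capture_pre_autograd_graph  # assumed import path
from torch.export import Dim, export
from torch.nn.attention import SDPBackend

from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config  # assumed
from executorch.exir import to_edge

from model import GPT  # nanoGPT model definition (assumed local file)

model = GPT.from_pretrained('gpt2')

# Example input and dynamic shape bounds, matching the diff above.
example_inputs = (
    torch.randint(0, 100, (1, model.config.block_size - 1), dtype=torch.long),
)
dynamic_shape = (
    {1: Dim("token_dim", max=model.config.block_size - 1)},
)

# Trace the model with training-specific logic excluded.
with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
    m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape)
    traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape)

# Lower to the XNNPACK backend using an XNNPACK-specific edge compile config.
edge_config = get_xnnpack_edge_compile_config()
edge_manager = to_edge(traced_model, compile_config=edge_config)
edge_manager = edge_manager.to_backend(XnnpackPartitioner())
et_program = edge_manager.to_executorch()

# Serialize the delegated program to nanogpt.pte for the runner to load.
with open("nanogpt.pte", "wb") as f:
    f.write(et_program.buffer)
```

Rebuilding and rerunning nanogpt_runner as shown above then exercises the delegated program.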