# Differentiate between release and main branch for running examples #1204

**Merged** · 7 commits · Feb 1, 2025
**README.md** — 24 additions, 0 deletions

@@ -143,6 +143,30 @@ See [installation instructions](https://onnxruntime.ai/docs/genai/howto/install)

### Choosing the Right Examples: Release vs. Main Branch

Due to the evolving nature of this project and ongoing feature additions, the examples in the `main` branch may not always align with the latest stable release. This section explains how to match the examples to the version you have installed. Most steps stay the same; only the package installation and the model example file differ.

#### Stable version
Install the package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). For example, if you installed version 0.5.2 of ONNX Runtime GenAI, the steps would look like this:

```bash
# Clone the repo
git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
# Checkout the branch for the version you are using
git checkout v0.5.2
cd examples
```
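
A quick way to confirm that the installed package and the checked-out examples match is to run a minimal generation script. The sketch below uses the API calls shown in this repository's examples; the model path is a placeholder for a model you have already downloaded, and `is_done`, `generate_next_token`, and `get_sequence` are assumed from the stock Python API rather than taken from this PR.

```python
# Minimal smoke test (sketch): verify the installed package works with the
# examples you checked out. "path/to/model" is a placeholder for a local
# ONNX Runtime GenAI model directory.
import onnxruntime_genai as og

model = og.Model("path/to/model")
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.set_search_options(max_length=64)
generator = og.Generator(model, params)

generator.append_tokens(tokenizer.encode("Hello, world!"))
while not generator.is_done():
    generator.generate_next_token()
print(tokenizer.decode(generator.get_sequence(0)))

# Free the generator (and any captured graph) before creating another one
del generator
```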

#### Nightly version (main branch)
Build the package from source using these [instructions](https://onnxruntime.ai/docs/genai/howto/build-from-source.html), then navigate to the directory containing the examples:

```bash
# Clone the repo
git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
cd examples
```
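
When working against a source build, it is easy to accidentally pick up a previously installed wheel instead. A simple check, sketched below, is to print where Python imports the package from:

```python
# Sketch: confirm which onnxruntime_genai build Python is importing.
# The printed path should point at your source-built package, not an
# older wheel installed from PyPI.
import onnxruntime_genai as og
print(og.__file__)
```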

## Roadmap

See the [Discussions](https://github.com/microsoft/onnxruntime-genai/discussions) to request new features and up-vote existing requests.
**examples/python/model-qa.py** — 10 additions, 10 deletions

@@ -42,10 +42,6 @@ def main(args):
else:
raise ValueError(f"Chat Template for model type {model.type} is not known. Please provide chat template using --chat_template")

params = og.GeneratorParams(model)
params.set_search_options(**search_options)
generator = og.Generator(model, params)

# Set system prompt
if model.type.startswith('phi2') or model.type.startswith('phi3'):
system_prompt = f"<|system|>\n{args.system_prompt}<|end|>"
@@ -59,7 +55,6 @@ def main(args):
system_prompt = args.system_prompt

system_tokens = tokenizer.encode(system_prompt)
generator.append_tokens(system_tokens)
system_prompt_length = len(system_tokens)

# Keep asking for input prompts in a loop
@@ -77,6 +72,12 @@
prompt = f'{args.chat_template.format(input=text)}'

input_tokens = tokenizer.encode(prompt)

params = og.GeneratorParams(model)
params.set_search_options(**search_options)
generator = og.Generator(model, params)
# Append system tokens to the generator
generator.append_tokens(system_tokens)

generator.append_tokens(input_tokens)
if args.verbose: print("Generator created")
@@ -105,14 +106,14 @@ def main(args):
print()
print()

# Delete the generator to free the captured graph for the next generator, if graph capture is enabled

del generator

if args.timings:
prompt_time = first_token_timestamp - started_timestamp
run_time = time.time() - first_token_timestamp
print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps")

# Rewind the generator to the system prompt; this erases all of the model's memory.
if args.rewind:
generator.rewind_to(system_prompt_length)

if __name__ == "__main__":
parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai")
@@ -129,6 +130,5 @@ def main(args):
parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false')
parser.add_argument('-c', '--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}')
parser.add_argument('-s', '--system_prompt', type=str, default='You are a helpful AI assistant.', help='System prompt to use for the prompt.')
parser.add_argument('-r', '--rewind', action='store_true', default=True, help='Rewind to the system prompt after each generation. Defaults to true')
args = parser.parse_args()
main(args)
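
The net effect of this diff is that a fresh generator is created for every user turn, seeded with the system prompt, and deleted after the response, which makes the `--rewind` option unnecessary. Below is a condensed sketch of the resulting loop. The model path and chat template strings are placeholders, and `is_done`, `generate_next_token`, `get_sequence`, and `decode` are assumed from the stock Python API; only the generator-per-turn structure is taken from this PR.

```python
# Condensed sketch of the per-turn pattern this PR introduces in model-qa.py.
import onnxruntime_genai as og

model = og.Model("path/to/model")  # placeholder model directory
tokenizer = og.Tokenizer(model)
search_options = {"max_length": 2048}
system_tokens = tokenizer.encode("<|system|>\nYou are a helpful AI assistant.<|end|>")

while True:
    text = input("Input: ")
    if not text:
        break

    # A fresh generator per turn replaces rewind_to(): every turn starts
    # from the system prompt with no memory of earlier turns.
    params = og.GeneratorParams(model)
    params.set_search_options(**search_options)
    generator = og.Generator(model, params)
    generator.append_tokens(system_tokens)
    generator.append_tokens(tokenizer.encode(f"<|user|>\n{text}<|end|>\n<|assistant|>"))

    while not generator.is_done():
        generator.generate_next_token()
    print(tokenizer.decode(generator.get_sequence(0)))

    # Deleting the generator frees the captured graph for the next turn,
    # if graph capture is enabled.
    del generator
```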
**examples/python/phi-3-tutorial.md** — 6 additions, 3 deletions

@@ -66,10 +66,11 @@ Are you on a Windows machine with GPU?

3. Run the model

Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py).
Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py). If you are using a stable version of ONNX Runtime GenAI, download the script from the matching release tag instead (an example for v0.5.2 is shown below).

```bash
curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py
# For stable release v0.5.2: curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/v0.5.2/examples/python/phi3-qa.py -o phi3-qa.py
python phi3-qa.py -m directml\directml-int4-awq-block-128 -e dml
```

@@ -105,10 +106,11 @@ Are you on a Windows machine with GPU?

3. Run the model

Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py).
Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py). If you are using a stable version of ONNX Runtime GenAI, download the script from the matching release tag instead (an example for v0.5.2 is shown below).

```bash
curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py
# For stable release v0.5.2: curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/v0.5.2/examples/python/phi3-qa.py -o phi3-qa.py
python phi3-qa.py -m cuda/cuda-int4-rtn-block-32 -e cuda
```

@@ -138,10 +140,11 @@ Are you on a Windows machine with GPU?

3. Run the model

Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py).
Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py). If you are using a stable version of ONNX Runtime GenAI, download the script from the matching release tag instead (an example for v0.5.2 is shown below).

```bash
curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py
# For stable release v0.5.2: curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/v0.5.2/examples/python/phi3-qa.py -o phi3-qa.py
python phi3-qa.py -m cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4 -e cpu
```
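
Since the same version-matching advice applies to each execution provider above, a small helper can fetch the script that corresponds to your installed version. The sketch below assumes the release tags follow the `vX.Y.Z` pattern used in this PR's examples:

```python
# Sketch: fetch the phi3-qa.py that matches a given release tag.
# "v0.5.2" mirrors the example tag used in this PR; use "main" for nightly.
import urllib.request

version = "v0.5.2"
url = (
    "https://raw.githubusercontent.com/microsoft/onnxruntime-genai/"
    f"{version}/examples/python/phi3-qa.py"
)
urllib.request.urlretrieve(url, "phi3-qa.py")
print(f"Downloaded phi3-qa.py from {version}")
```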
