diff --git a/README.md b/README.md
index 93c59321d..ed53b4211 100644
--- a/README.md
+++ b/README.md
@@ -143,6 +143,30 @@ See [installation instructions](https://onnxruntime.ai/docs/genai/howto/install)
   del generator
 ```

+### Choosing the Right Examples: Release vs. Main Branch
+
+Due to the evolving nature of this project and ongoing feature additions, examples in the `main` branch may not always align with the latest stable release. This section outlines how to ensure compatibility between the examples and the version you are using. Most of the steps remain the same; only the package installation and the model example file change.
+
+#### Stable version
+Install the package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). For example, if you installed version 0.5.2 of ONNX Runtime GenAI, the instructions would look like this:
+
+```bash
+# Clone the repo
+git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
+# Check out the tag for the version you are using
+git checkout v0.5.2
+cd examples
+```
+
+#### Nightly version (main branch)
+Build the package from source using these [instructions](https://onnxruntime.ai/docs/genai/howto/build-from-source.html). Then navigate to the folder that contains the examples:
+
+```bash
+# Clone the repo
+git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
+cd examples
+```
+
 ## Roadmap

 See the [Discussions](https://github.com/microsoft/onnxruntime-genai/discussions) to request new features and up-vote existing requests.
diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py
index e5e449a77..58c265f18 100644
--- a/examples/python/model-qa.py
+++ b/examples/python/model-qa.py
@@ -56,10 +56,6 @@ def main(args):
     else:
         raise ValueError(f"Chat Template for model type {model_type} is not known. Please provide chat template using --chat_template")

-    params = og.GeneratorParams(model)
-    params.set_search_options(**search_options)
-    generator = og.Generator(model, params)
-
     # Set system prompt
     if "<|" in args.system_prompt and "|>" in args.system_prompt:
         # User-provided system template already has tags
@@ -79,7 +75,6 @@ def main(args):
         system_prompt = args.system_prompt

     system_tokens = tokenizer.encode(system_prompt)
-    generator.append_tokens(system_tokens)
     system_prompt_length = len(system_tokens)

     # Keep asking for input prompts in a loop
@@ -93,6 +88,12 @@ def main(args):
             prompt = f'{args.chat_template.format(input=text)}'

         input_tokens = tokenizer.encode(prompt)
+
+        params = og.GeneratorParams(model)
+        params.set_search_options(**search_options)
+        generator = og.Generator(model, params)
+        # Append system tokens to the generator
+        generator.append_tokens(system_tokens)
         generator.append_tokens(input_tokens)
         if args.verbose: print("Generator created")

@@ -121,14 +122,14 @@ def main(args):
             print()
         print()

+        # Delete the generator to free the captured graph for the next generator, if graph capture is enabled
+
+        del generator
+
         if args.timings:
             prompt_time = first_token_timestamp - started_timestamp
             run_time = time.time() - first_token_timestamp
             print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps")
-
-        # Rewind the generator to the system prompt, this will erase all the memory of the model.
-        if args.rewind:
-            generator.rewind_to(system_prompt_length)

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai")
@@ -145,6 +146,5 @@ def main(args):
     parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false')
     parser.add_argument('-c', '--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}')
     parser.add_argument('-s', '--system_prompt', type=str, default='You are a helpful AI assistant.', help='System prompt to use for the prompt.')
-    parser.add_argument('-r', '--rewind', action='store_true', default=True, help='Rewind to the system prompt after each generation. Defaults to true')
     args = parser.parse_args()
     main(args)
diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md
index 5b32f68d8..32b09375d 100644
--- a/examples/python/phi-3-tutorial.md
+++ b/examples/python/phi-3-tutorial.md
@@ -66,10 +66,11 @@ Are you on a Windows machine with GPU?

 3. Run the model

-   Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py).
+   Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py). If you are using a stable release of ONNX Runtime GenAI, download the script from the matching release tag (an example for v0.5.2 is shown below).

    ```bash
    curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py
+   # For stable release v0.5.2: curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/v0.5.2/examples/python/phi3-qa.py -o phi3-qa.py
    python phi3-qa.py -m directml\directml-int4-awq-block-128 -e dml
    ```

@@ -105,10 +106,11 @@ Are you on a Windows machine with GPU?

 3. Run the model

-   Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py).
+   Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py). If you are using a stable release of ONNX Runtime GenAI, download the script from the matching release tag (an example for v0.5.2 is shown below).

    ```bash
    curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py
+   # For stable release v0.5.2: curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/v0.5.2/examples/python/phi3-qa.py -o phi3-qa.py
    python phi3-qa.py -m cuda/cuda-int4-rtn-block-32 -e cuda
    ```

@@ -138,10 +140,11 @@ Are you on a Windows machine with GPU?

 3. Run the model

-   Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py).
+   Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py). If you are using a stable release of ONNX Runtime GenAI, download the script from the matching release tag (an example for v0.5.2 is shown below).

    ```bash
    curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py
+   # For stable release v0.5.2: curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/v0.5.2/examples/python/phi3-qa.py -o phi3-qa.py
    python phi3-qa.py -m cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4 -e cpu
    ```
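
A note on the model-qa.py change above: instead of creating one generator up front and rewinding it to the system prompt after each answer, the script now builds a fresh generator for every user prompt, appends the system tokens followed by the user tokens, and deletes the generator once the response is printed so that any captured graph is released before the next turn. Conversation history is therefore not carried across turns, which is also why the --rewind option is removed. The sketch below is a minimal, self-contained illustration of that per-prompt pattern, not the full script; the model folder path, the Phi-3-style chat tags, and the search options are placeholders.

```python
# Minimal sketch of the per-prompt generator pattern used by model-qa.py after this change.
# Assumes onnxruntime-genai is installed; the model path and chat tags below are placeholders.
import onnxruntime_genai as og

model = og.Model("path/to/model")            # placeholder model folder
tokenizer = og.Tokenizer(model)
tokenizer_stream = tokenizer.create_stream()

# Encode the system prompt once; it is re-appended to every new generator
system_tokens = tokenizer.encode("<|system|>\nYou are a helpful AI assistant.<|end|>")

while True:
    text = input("Prompt (empty to exit): ")
    if not text:
        break
    input_tokens = tokenizer.encode(f"<|user|>\n{text}<|end|>\n<|assistant|>")

    # A new generator is created for each prompt
    params = og.GeneratorParams(model)
    params.set_search_options(max_length=2048)
    generator = og.Generator(model, params)
    generator.append_tokens(system_tokens)   # system prompt first
    generator.append_tokens(input_tokens)    # then the user turn

    # Stream the response token by token
    while not generator.is_done():
        generator.generate_next_token()
        new_token = generator.get_next_tokens()[0]
        print(tokenizer_stream.decode(new_token), end="", flush=True)
    print()

    # Delete the generator to free its resources (e.g. a captured graph) before the next turn
    del generator
```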