diff --git a/aaa.cmake b/aaa.cmake
new file mode 100644
index 0000000000..18d6981fd2
--- /dev/null
+++ b/aaa.cmake
@@ -0,0 +1,27 @@
+cmake_minimum_required(VERSION 3.19)
+project(nanogpt_runner)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+
+# Set options for executorch build.
+option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
+option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
+option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON)
+option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
+
+# Include the executorch subdirectory.
+add_subdirectory(
+  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch
+  ${CMAKE_BINARY_DIR}/executorch
+)
+
+add_executable(nanogpt_runner main.cpp)
+target_link_libraries(
+  nanogpt_runner
+  PRIVATE executorch
+          extension_module_static # Provides the Module class
+          extension_tensor # Provides the TensorPtr class
+          optimized_native_cpu_ops_lib # Provides baseline cross-platform
+                                       # kernels
+)
diff --git a/docs/source/Doxyfile b/docs/source/Doxyfile
index e662105b83..0d60bf51c7 100644
--- a/docs/source/Doxyfile
+++ b/docs/source/Doxyfile
@@ -943,7 +943,8 @@ WARN_LOGFILE =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT = ../runtime/executor/memory_manager.h \
+INPUT = ../devtools/bundled_program/bundled_program.h \
+        ../runtime/executor/memory_manager.h \
         ../runtime/executor/method.h \
         ../runtime/executor/method_meta.h \
         ../runtime/executor/program.h \
diff --git a/docs/source/build-run-coreml.md b/docs/source/build-run-coreml.md
index 9751dc066f..45a7ecafce 100644
--- a/docs/source/build-run-coreml.md
+++ b/docs/source/build-run-coreml.md
@@ -147,11 +147,10 @@ libsqlite3.tbd
 7. Update the code to load the program from the Application's bundle.
 
 ``` objective-c
-using namespace torch::executor;
-
 NSURL *model_url = [NBundle.mainBundle URLForResource:@"mv3_coreml_all" extension:@"pte"];
 
-Result loader = util::FileDataLoader::from(model_url.path.UTF8String);
+Result loader =
+    executorch::extension::FileDataLoader::from(model_url.path.UTF8String);
 ```
 
 8. Use [Xcode](https://developer.apple.com/documentation/xcode/building-and-running-an-app#Build-run-and-debug-your-app) to deploy the application on the device.
diff --git a/docs/source/bundled-io.md b/docs/source/bundled-io.md
index 74c86cb8cc..08961db776 100644
--- a/docs/source/bundled-io.md
+++ b/docs/source/bundled-io.md
@@ -201,14 +201,14 @@ This stage mainly focuses on executing the model with the bundled inputs and and
 ### Get ExecuTorch Program Pointer from `BundledProgram` Buffer
 We need the pointer to ExecuTorch program to do the execution. To unify the process of loading and executing `BundledProgram` and Program flatbuffer, we create an API:
 
-:::{dropdown} `GetProgramData`
+:::{dropdown} `get_program_data`
 
 ```{eval-rst}
-.. doxygenfunction:: torch::executor::bundled_program::GetProgramData
+.. doxygenfunction:: ::executorch::bundled_program::get_program_data
 ```
 :::
 
-Here's an example of how to use the `GetProgramData` API:
+Here's an example of how to use the `get_program_data` API:
 ```c++
 // Assume that the user has read the contents of the file into file_data using
 // whatever method works best for their application. The file could contain
@@ -216,36 +216,36 @@ Here's an example of how to use the `get_program_data` API:
 void* file_data = ...;
 size_t file_data_len = ...;
 
-// If file_data contains a BundledProgram, GetProgramData() will return a
+// If file_data contains a BundledProgram, get_program_data() will return a
 // pointer to the Program data embedded inside it. Otherwise it will return
 // file_data, which already pointed to Program data.
 const void* program_ptr;
 size_t program_len;
-status = torch::executor::bundled_program::GetProgramData(
+status = executorch::bundled_program::get_program_data(
     file_data, file_data_len, &program_ptr, &program_len);
 ET_CHECK_MSG(
     status == Error::Ok,
-    "GetProgramData() failed with status 0x%" PRIx32,
+    "get_program_data() failed with status 0x%" PRIx32,
     status);
 ```
 
 ### Load Bundled Input to Method
-To execute the program on the bundled input, we need to load the bundled input into the method. Here we provided an API called `torch::executor::bundled_program::LoadBundledInput`:
+To execute the program on the bundled input, we need to load the bundled input into the method. Here we provided an API called `executorch::bundled_program::load_bundled_input`:
 
-:::{dropdown} `LoadBundledInput`
+:::{dropdown} `load_bundled_input`
 
 ```{eval-rst}
-.. doxygenfunction:: torch::executor::bundled_program::LoadBundledInput
+.. doxygenfunction:: ::executorch::bundled_program::load_bundled_input
 ```
 :::
 
 ### Verify the Method's Output.
-We call `torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput` to verify the method's output with bundled expected outputs. Here's the details of this API:
+We call `executorch::bundled_program::verify_method_outputs` to verify the method's output with bundled expected outputs. Here's the details of this API:
 
-:::{dropdown} `VerifyResultWithBundledExpectedOutput`
+:::{dropdown} `verify_method_outputs`
 
 ```{eval-rst}
-.. doxygenfunction:: torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput
+.. doxygenfunction:: ::executorch::bundled_program::verify_method_outputs
 ```
 :::
 
@@ -266,13 +266,13 @@ ET_CHECK_MSG(
     method.error());
 
 // Load testset_idx-th input in the buffer to plan
-status = torch::executor::bundled_program::LoadBundledInput(
+status = executorch::bundled_program::load_bundled_input(
     *method,
     program_data.bundled_program_data(),
     FLAGS_testset_idx);
 ET_CHECK_MSG(
     status == Error::Ok,
-    "LoadBundledInput failed with status 0x%" PRIx32,
+    "load_bundled_input failed with status 0x%" PRIx32,
     status);
 
 // Execute the plan
@@ -283,7 +283,7 @@ ET_CHECK_MSG(
     status);
 
 // Verify the result.
-status = torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput(
+status = executorch::bundled_program::verify_method_outputs(
     *method,
     program_data.bundled_program_data(),
     FLAGS_testset_idx,
diff --git a/docs/source/concepts.md b/docs/source/concepts.md
index c085505b61..289ecda6d8 100644
--- a/docs/source/concepts.md
+++ b/docs/source/concepts.md
@@ -26,7 +26,7 @@ The goal of ATen dialect is to capture users’ programs as faithfully as possib
 
 ## ATen mode
 
-ATen mode uses the ATen implementation of Tensor (`at::Tensor`) and related types, such as `ScalarType`, from the PyTorch core. This is in contrast to portable mode, which uses ExecuTorch’s smaller implementation of tensor (`torch::executor::Tensor`) and related types, such as `torch::executor::ScalarType`.
+ATen mode uses the ATen implementation of Tensor (`at::Tensor`) and related types, such as `ScalarType`, from the PyTorch core. This is in contrast to ETensor mode, which uses ExecuTorch’s smaller implementation of tensor (`executorch::runtime::etensor::Tensor`) and related types, such as `executorch::runtime::etensor::ScalarType`.
 - ATen kernels that rely on the full `at::Tensor` API are usable in this configuration.
 - ATen kernels tend to do dynamic memory allocation and often have extra flexibility (and thus overhead) to handle cases not needed by mobile/embedded clients. e.g., CUDA support, sparse tensor support, and dtype promotion.
 - Note: ATen mode is currently a WIP.
@@ -244,10 +244,10 @@ Kernels that support a subset of tensor dtypes and/or dim orders.
 
 Parts of a model may be delegated to run on an optimized backend. The partitioner splits the graph into the appropriate sub-networks and tags them for delegation.
 
-## Portable mode (lean mode)
+## ETensor mode
 
-Portable mode uses ExecuTorch’s smaller implementation of tensor (`torch::executor::Tensor`) along with related types (`torch::executor::ScalarType`, etc.). This is in contrast to ATen mode, which uses the ATen implementation of Tensor (`at::Tensor`) and related types (`ScalarType`, etc.)
-- `torch::executor::Tensor`, also known as ETensor, is a source-compatible subset of `at::Tensor`. Code written against ETensor can build against `at::Tensor`.
+ETensor mode uses ExecuTorch’s smaller implementation of tensor (`executorch::runtime::etensor::Tensor`) along with related types (`executorch::runtime::etensor::ScalarType`, etc.). This is in contrast to ATen mode, which uses the ATen implementation of Tensor (`at::Tensor`) and related types (`ScalarType`, etc.)
+- `executorch::runtime::etensor::Tensor`, also known as ETensor, is a source-compatible subset of `at::Tensor`. Code written against ETensor can build against `at::Tensor`.
 - ETensor does not own or allocate memory on its own. To support dynamic shapes, kernels can allocate Tensor data using the MemoryAllocator provided by the client.
 
 ## Portable kernels
diff --git a/docs/source/etdump.md b/docs/source/etdump.md
index 42391cf40e..13957f1c01 100644
--- a/docs/source/etdump.md
+++ b/docs/source/etdump.md
@@ -15,7 +15,7 @@ Generating an ETDump is a relatively straightforward process. Users can follow t
 2. ***Create*** an Instance of the ETDumpGen class and pass it into the `load_method` call that is invoked in the runtime.
 
 ```C++
-torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen();
+executorch::etdump::ETDumpGen etdump_gen;
 Result method = program->load_method(method_name, &memory_manager, &etdump_gen);
 ```
 
diff --git a/docs/source/executorch-runtime-api-reference.rst b/docs/source/executorch-runtime-api-reference.rst
index ef9a99e593..5bec597987 100644
--- a/docs/source/executorch-runtime-api-reference.rst
+++ b/docs/source/executorch-runtime-api-reference.rst
@@ -11,25 +11,25 @@ For detailed information on how APIs evolve and the deprecation process, please
 Model Loading and Execution
 ---------------------------
 
-.. doxygenclass:: executorch::runtime::DataLoader
+.. doxygenclass:: executorch::runtime::Program
   :members:
 
-.. doxygenclass:: executorch::runtime::MemoryAllocator
+.. doxygenclass:: executorch::runtime::Method
   :members:
 
-.. doxygenclass:: executorch::runtime::HierarchicalAllocator
+.. doxygenclass:: executorch::runtime::MethodMeta
   :members:
 
-.. doxygenclass:: executorch::runtime::MemoryManager
+.. doxygenclass:: executorch::runtime::DataLoader
   :members:
 
-.. doxygenclass:: executorch::runtime::Program
+.. doxygenclass:: executorch::runtime::MemoryAllocator
   :members:
 
-.. doxygenclass:: executorch::runtime::Method
+.. doxygenclass:: executorch::runtime::HierarchicalAllocator
   :members:
 
-.. doxygenclass:: executorch::runtime::MethodMeta
+.. doxygenclass:: executorch::runtime::MemoryManager
   :members:
 
 Values
@@ -38,5 +38,5 @@ Values
 .. doxygenstruct:: executorch::runtime::EValue
   :members:
 
-.. doxygenclass:: executorch::aten::Tensor
+.. doxygenclass:: executorch::runtime::etensor::Tensor
   :members:
diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md
index 71b13598ed..2cdf13ca65 100644
--- a/docs/source/llm/getting-started.md
+++ b/docs/source/llm/getting-started.md
@@ -43,14 +43,13 @@ cd et-nanogpt
 
 # Clone the ExecuTorch repository and submodules.
 mkdir third-party
-git clone -b release/0.2 https://github.com/pytorch/executorch.git third-party/executorch
+git clone -b release/0.4 https://github.com/pytorch/executorch.git third-party/executorch
 cd third-party/executorch
 git submodule update --init
 
 # Create a conda environment and install requirements.
 conda create -yn executorch python=3.10.0
 conda activate executorch
-pip install cmake zstd
 ./install_requirements.sh
 
 cd ../..
@@ -77,12 +76,11 @@ pyenv activate executorch
 
 # Clone the ExecuTorch repository and submodules.
 mkdir third-party
-git clone -b release/0.2 https://github.com/pytorch/executorch.git third-party/executorch
+git clone -b release/0.4 https://github.com/pytorch/executorch.git third-party/executorch
 cd third-party/executorch
 git submodule update --init
 
 # Install requirements.
-pip install cmake zstd
 PYTHON_EXECUTABLE=python ./install_requirements.sh
 
 cd ../..
@@ -208,8 +206,8 @@ Create a file called main.cpp with the following contents:
 #include 
 #include 
 
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
 using executorch::extension::from_blob;
 using executorch::extension::Module;
 using executorch::runtime::EValue;
@@ -235,56 +233,56 @@ std::string generate(
     BasicSampler& sampler,
     size_t max_input_length,
     size_t max_output_length) {
+  // Convert the input text into a list of integers (tokens) that represents it,
+  // using the string-to-token mapping that the model was trained on. Each token
+  // is an integer that represents a word or part of a word.
+  std::vector input_tokens = tokenizer.encode(prompt);
+  std::vector output_tokens;
+
+  for (auto i = 0u; i < max_output_length; i++) {
+    // Convert the input_tokens from a vector of int64_t to EValue. EValue is a
+    // unified data type in the ExecuTorch runtime.
+    auto inputs = from_blob(
+        input_tokens.data(),
+        {1, static_cast(input_tokens.size())},
+        ScalarType::Long);
+
+    // Run the model. It will return a tensor of logits (log-probabilities).
+    auto logits_evalue = llm_model.forward(inputs);
+
+    // Convert the output logits from EValue to std::vector, which is what the
+    // sampler expects.
+    Tensor logits_tensor = logits_evalue.get()[0].toTensor();
+    std::vector logits(
+        logits_tensor.data_ptr(),
+        logits_tensor.data_ptr() + logits_tensor.numel());
+
+    // Sample the next token from the logits.
+    int64_t next_token = sampler.sample(logits);
+
+    // Break if we reached the end of the text.
+    if (next_token == ENDOFTEXT_TOKEN) {
+      break;
+    }
+
+    // Add the next token to the output.
+    output_tokens.push_back(next_token);
+
+    std::cout << tokenizer.decode({next_token});
+    std::cout.flush();
-    // Convert the input text into a list of integers (tokens) that represents
-    // it, using the string-to-token mapping that the model was trained on.
-    // Each token is an integer that represents a word or part of a word.
-    std::vector input_tokens = tokenizer.encode(prompt);
-    std::vector output_tokens;
-
-    for (auto i = 0u; i < max_output_length; i++) {
-        // Convert the input_tokens from a vector of int64_t to EValue.
-        // EValue is a unified data type in the ExecuTorch runtime.
-        auto inputs = from_blob(
-            input_tokens.data(),
-            {1, static_cast(input_tokens.size())},
-            ScalarType::Long);
-
-        // Run the model. It will return a tensor of logits (log-probabilities).
-        auto logits_evalue = llm_model.forward(inputs);
-
-        // Convert the output logits from EValue to std::vector, which is what
-        // the sampler expects.
-        Tensor logits_tensor = logits_evalue.get()[0].toTensor();
-        std::vector logits(logits_tensor.data_ptr(),
-            logits_tensor.data_ptr() + logits_tensor.numel());
-
-        // Sample the next token from the logits.
-        int64_t next_token = sampler.sample(logits);
-
-        // Break if we reached the end of the text.
-        if (next_token == ENDOFTEXT_TOKEN) {
-            break;
-        }
-
-        // Add the next token to the output.
-        output_tokens.push_back(next_token);
-
-        std::cout << tokenizer.decode({ next_token });
-        std::cout.flush();
-
-        // Update next input.
-        input_tokens.push_back(next_token);
-        if (input_tokens.size() > max_input_length) {
-            input_tokens.erase(input_tokens.begin());
-        }
+    // Update next input.
+    input_tokens.push_back(next_token);
+    if (input_tokens.size() > max_input_length) {
+      input_tokens.erase(input_tokens.begin());
     }
+  }
 
-    std::cout << std::endl;
+  std::cout << std::endl;
 
-    // Convert the output tokens into a human-readable string.
-    std::string output_string = tokenizer.decode(output_tokens);
-    return output_string;
+  // Convert the output tokens into a human-readable string.
+  std::string output_string = tokenizer.decode(output_tokens);
+  return output_string;
 }
 ```
@@ -309,32 +307,32 @@ penalties for repeated tokens, and biases to prioritize or de-prioritize specifi
 
 ```cpp
 // main.cpp
-using namespace torch::executor;
-
 int main() {
-    // Set up the prompt. This provides the seed text for the model to elaborate.
-    std::cout << "Enter model prompt: ";
-    std::string prompt;
-    std::getline(std::cin, prompt);
-
-    // The tokenizer is used to convert between tokens (used by the model) and
-    // human-readable strings.
-    BasicTokenizer tokenizer("vocab.json");
-
-    // The sampler is used to sample the next token from the logits.
-    BasicSampler sampler = BasicSampler();
-
-    // Load the exported nanoGPT program, which was generated via the previous steps.
-    Module model("nanogpt.pte", Module::LoadMode::MmapUseMlockIgnoreErrors);
-
-    const auto max_input_tokens = 1024;
-    const auto max_output_tokens = 30;
-    std::cout << prompt;
-    generate(model, prompt, tokenizer, sampler, max_input_tokens, max_output_tokens);
+  // Set up the prompt. This provides the seed text for the model to elaborate.
+  std::cout << "Enter model prompt: ";
+  std::string prompt;
+  std::getline(std::cin, prompt);
+
+  // The tokenizer is used to convert between tokens (used by the model) and
+  // human-readable strings.
+  BasicTokenizer tokenizer("vocab.json");
+
+  // The sampler is used to sample the next token from the logits.
+  BasicSampler sampler = BasicSampler();
+
+  // Load the exported nanoGPT program, which was generated via the previous
+  // steps.
+  Module model("nanogpt.pte", Module::LoadMode::MmapUseMlockIgnoreErrors);
+
+  const auto max_input_tokens = 1024;
+  const auto max_output_tokens = 30;
+  std::cout << prompt;
+  generate(
+      model, prompt, tokenizer, sampler, max_input_tokens, max_output_tokens);
 }
 ```
 
-Finally, download the following files into the same directory as main.h:
+Finally, download the following files into the same directory as main.cpp:
 
 ```
 curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_sampler.h
@@ -368,17 +366,19 @@ option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
 
 # Include the executorch subdirectory.
 add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch
-    ${CMAKE_BINARY_DIR}/third-party/executorch)
+  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch
+  ${CMAKE_BINARY_DIR}/executorch
+)
 
 add_executable(nanogpt_runner main.cpp)
 target_link_libraries(
-    nanogpt_runner
-    PRIVATE
-    executorch
-    extension_module_static # Provides the Module class
-    extension_tensor # Provides the TensorPtr class
-    optimized_native_cpu_ops_lib) # Provides baseline cross-platform kernels
+  nanogpt_runner
+  PRIVATE executorch
+          extension_module_static # Provides the Module class
+          extension_tensor # Provides the TensorPtr class
+          optimized_native_cpu_ops_lib # Provides baseline cross-platform
+                                       # kernels
+)
 ```
 
 At this point, the working directory should contain the following files:
@@ -524,20 +524,20 @@ option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend
 
 # Include the executorch subdirectory.
 add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch
-    ${CMAKE_BINARY_DIR}/executorch)
-
-# include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
+  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch
+  ${CMAKE_BINARY_DIR}/executorch
+)
 
 add_executable(nanogpt_runner main.cpp)
 target_link_libraries(
-    nanogpt_runner
-    PRIVATE
-    executorch
-    extension_module_static # Provides the Module class
-    extension_tensor # Provides the TensorPtr class
-    optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels
-    xnnpack_backend) # Provides the XNNPACK CPU acceleration backend
+  nanogpt_runner
+  PRIVATE executorch
+          extension_module_static # Provides the Module class
+          extension_tensor # Provides the TensorPtr class
+          optimized_native_cpu_ops_lib # Provides baseline cross-platform
+                                       # kernels
+          xnnpack_backend # Provides the XNNPACK CPU acceleration backend
+)
 ```
 
 Keep the rest of the code the same. For more details refer to [Exporting
diff --git a/docs/source/running-a-model-cpp-tutorial.md b/docs/source/running-a-model-cpp-tutorial.md
index edd5d95086..70e17c8222 100644
--- a/docs/source/running-a-model-cpp-tutorial.md
+++ b/docs/source/running-a-model-cpp-tutorial.md
@@ -24,14 +24,25 @@ Users can define their own `DataLoader`s to fit the needs of their particular sy
 For the `FileDataLoader` all we need to do is provide a file path to the constructor.
 
 ``` cpp
-using namespace torch::executor;
-
-Result loader =
-    util::FileDataLoader::from("/tmp/model.pte");
+using executorch::aten::Tensor;
+using executorch::aten::TensorImpl;
+using executorch::extension::FileDataLoader;
+using executorch::extension::MallocMemoryAllocator;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::HierarchicalAllocator;
+using executorch::runtime::MemoryManager;
+using executorch::runtime::Method;
+using executorch::runtime::MethodMeta;
+using executorch::runtime::Program;
+using executorch::runtime::Result;
+using executorch::runtime::Span;
+
+Result loader =
+    FileDataLoader::from("/tmp/model.pte");
 assert(loader.ok());
 
-Result program =
-    torch::executor::Program::load(&loader.get());
+Result program = Program::load(&loader.get());
 assert(program.ok());
 ```
 
@@ -48,14 +59,13 @@ One of the principles of ExecuTorch is giving users control over where the memor
 For this example we will retrieve the size of the planned memory arenas dynamically from the `Program`, but for heapless environments users could retrieve this information from the `Program` ahead of time and allocate the arena statically. We will also be using a malloc based allocator for the method allocator.
 
 ``` cpp
-
-// Method names map back to Python nn.Module method names. Most users will only have the singular method "forward".
+// Method names map back to Python nn.Module method names. Most users will only
+// have the singular method "forward".
 const char* method_name = "forward";
 
 // MethodMeta is a lightweight structure that lets us gather metadata
-// information about a specific method. In this case we are looking to
-// get the required size of the memory planned buffers for the method
-// "forward".
+// information about a specific method. In this case we are looking to get the
+// required size of the memory planned buffers for the method "forward".
 Result method_meta = program->method_meta(method_name);
 assert(method_meta.ok());
 
@@ -64,7 +74,8 @@ std::vector> planned_arenas; // Passed to the allocator
 
 size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
 
-// It is possible to have multiple layers in our memory hierarchy; for example, SRAM and DRAM.
+// It is possible to have multiple layers in our memory hierarchy; for example,
+// SRAM and DRAM.
 for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
   // .get() will always succeed because id < num_memory_planned_buffers.
   size_t buffer_size =
@@ -75,12 +86,12 @@ for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
 HierarchicalAllocator planned_memory(
     {planned_arenas.data(), planned_arenas.size()});
 
-// Version of MemoryAllocator that uses malloc to handle allocations
-// rather then a fixed buffer.
-util::MallocMemoryAllocator method_allocator;
+// Version of MemoryAllocator that uses malloc to handle allocations rather then
+// a fixed buffer.
+MallocMemoryAllocator method_allocator;
 
-// Assemble all of the allocators into the MemoryManager that the Executor
-// will use.
+// Assemble all of the allocators into the MemoryManager that the Executor will
+// use.
 MemoryManager memory_manager(&method_allocator, &planned_memory);
 ```
 
diff --git a/examples/llm_manual/CMakeLists.txt b/examples/llm_manual/CMakeLists.txt
index e5054a683a..1283eb548e 100644
--- a/examples/llm_manual/CMakeLists.txt
+++ b/examples/llm_manual/CMakeLists.txt
@@ -23,8 +23,6 @@ add_subdirectory(
   ${CMAKE_BINARY_DIR}/executorch
 )
 
-# include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
-
 add_executable(nanogpt_runner main.cpp)
 target_link_libraries(
   nanogpt_runner
@@ -33,5 +31,5 @@ target_link_libraries(
   extension_tensor # Provides the TensorPtr class
   optimized_native_cpu_ops_lib # Provides baseline cross-platform
                                # kernels
-  xnnpack_backend
-) # Provides the XNNPACK CPU acceleration backend
+  xnnpack_backend # Provides the XNNPACK CPU acceleration backend
+)
diff --git a/examples/llm_manual/main.cpp b/examples/llm_manual/main.cpp
index 5bd318983b..76492513f9 100644
--- a/examples/llm_manual/main.cpp
+++ b/examples/llm_manual/main.cpp
@@ -17,15 +17,15 @@
 #include 
 #include 
 
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
 using executorch::extension::from_blob;
 using executorch::extension::Module;
 using executorch::runtime::EValue;
 using executorch::runtime::Result;
 
 // The value of the gpt2 `<|endoftext|>` token.
-#define ENDOFTEXT 50256
+#define ENDOFTEXT_TOKEN 50256
 
 std::string generate(
     Module& llm_model,
@@ -34,15 +34,15 @@ std::string generate(
     BasicSampler& sampler,
     size_t max_input_length,
     size_t max_output_length) {
-  // Convert the input text into a list of integers (tokens) that represents
-  // it, using the string-to-token mapping that the model was trained on.
-  // Each token is an integer that represents a word or part of a word.
+  // Convert the input text into a list of integers (tokens) that represents it,
+  // using the string-to-token mapping that the model was trained on. Each token
+  // is an integer that represents a word or part of a word.
   std::vector input_tokens = tokenizer.encode(prompt);
   std::vector output_tokens;
 
   for (auto i = 0u; i < max_output_length; i++) {
-    // Convert the input_tokens from a vector of int64_t to EValue.
-    // EValue is a unified data type in the ExecuTorch runtime.
+    // Convert the input_tokens from a vector of int64_t to EValue. EValue is a
+    // unified data type in the ExecuTorch runtime.
     auto inputs = from_blob(
         input_tokens.data(),
         {1, static_cast(input_tokens.size())},
@@ -51,8 +51,8 @@ std::string generate(
     // Run the model. It will return a tensor of logits (log-probabilities).
     auto logits_evalue = llm_model.forward(inputs);
 
-    // Convert the output logits from EValue to std::vector, which is what
-    // the sampler expects.
+    // Convert the output logits from EValue to std::vector, which is what the
+    // sampler expects.
     Tensor logits_tensor = logits_evalue.get()[0].toTensor();
     std::vector logits(
         logits_tensor.data_ptr(),
@@ -62,7 +62,7 @@ std::string generate(
     int64_t next_token = sampler.sample(logits);
 
     // Break if we reached the end of the text.
-    if (next_token == ENDOFTEXT) {
+    if (next_token == ENDOFTEXT_TOKEN) {
       break;
     }
 
@@ -86,11 +86,9 @@ std::string generate(
   return output_string;
 }
 
-// main.cpp
-
 int main() {
   // Set up the prompt. This provides the seed text for the model to elaborate.
-  std::cout << "Prompt: ";
+  std::cout << "Enter model prompt: ";
   std::string prompt;
   std::getline(std::cin, prompt);