diff --git a/aaa.cmake b/aaa.cmake
new file mode 100644
index 0000000000..18d6981fd2
--- /dev/null
+++ b/aaa.cmake
@@ -0,0 +1,27 @@
+cmake_minimum_required(VERSION 3.19)
+project(nanogpt_runner)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+
+# Set options for executorch build.
+option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
+option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
+option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON)
+option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
+
+# Include the executorch subdirectory.
+add_subdirectory(
+  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch
+  ${CMAKE_BINARY_DIR}/executorch
+)
+
+add_executable(nanogpt_runner main.cpp)
+target_link_libraries(
+  nanogpt_runner
+  PRIVATE executorch
+          extension_module_static # Provides the Module class
+          extension_tensor # Provides the TensorPtr class
+          optimized_native_cpu_ops_lib # Provides baseline cross-platform
+                                       # kernels
+)
diff --git a/docs/source/Doxyfile b/docs/source/Doxyfile
index e662105b83..0d60bf51c7 100644
--- a/docs/source/Doxyfile
+++ b/docs/source/Doxyfile
@@ -943,7 +943,8 @@ WARN_LOGFILE =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT = ../runtime/executor/memory_manager.h \
+INPUT = ../devtools/bundled_program/bundled_program.h \
+        ../runtime/executor/memory_manager.h \
         ../runtime/executor/method.h \
         ../runtime/executor/method_meta.h \
         ../runtime/executor/program.h \
diff --git a/docs/source/build-run-coreml.md b/docs/source/build-run-coreml.md
index 9751dc066f..45a7ecafce 100644
--- a/docs/source/build-run-coreml.md
+++ b/docs/source/build-run-coreml.md
@@ -147,11 +147,10 @@ libsqlite3.tbd
 7. Update the code to load the program from the Application's bundle.
 
 ``` objective-c
-using namespace torch::executor;
-
 NSURL *model_url = [NBundle.mainBundle URLForResource:@"mv3_coreml_all" extension:@"pte"];
 
-Result loader = util::FileDataLoader::from(model_url.path.UTF8String);
+Result loader =
+    executorch::extension::FileDataLoader::from(model_url.path.UTF8String);
 ```
 
 8. Use [Xcode](https://developer.apple.com/documentation/xcode/building-and-running-an-app#Build-run-and-debug-your-app) to deploy the application on the device.
diff --git a/docs/source/bundled-io.md b/docs/source/bundled-io.md
index 74c86cb8cc..08961db776 100644
--- a/docs/source/bundled-io.md
+++ b/docs/source/bundled-io.md
@@ -201,14 +201,14 @@ This stage mainly focuses on executing the model with the bundled inputs and and
 ### Get ExecuTorch Program Pointer from `BundledProgram` Buffer
 We need the pointer to ExecuTorch program to do the execution. To unify the process of loading and executing `BundledProgram` and Program flatbuffer, we create an API:
 
-:::{dropdown} `GetProgramData`
+:::{dropdown} `get_program_data`
 
 ```{eval-rst}
-.. doxygenfunction:: torch::executor::bundled_program::GetProgramData
+.. doxygenfunction:: ::executorch::bundled_program::get_program_data
 ```
 :::
 
-Here's an example of how to use the `GetProgramData` API:
+Here's an example of how to use the `get_program_data` API:
 ```c++
 // Assume that the user has read the contents of the file into file_data using
 // whatever method works best for their application. The file could contain
@@ -216,36 +216,36 @@ Here's an example of how to use the `get_program_data` API:
 void* file_data = ...;
 size_t file_data_len = ...;
 
-// If file_data contains a BundledProgram, GetProgramData() will return a
+// If file_data contains a BundledProgram, get_program_data() will return a
 // pointer to the Program data embedded inside it. Otherwise it will return
 // file_data, which already pointed to Program data.
 const void* program_ptr;
 size_t program_len;
-status = torch::executor::bundled_program::GetProgramData(
+status = executorch::bundled_program::get_program_data(
     file_data, file_data_len, &program_ptr, &program_len);
 ET_CHECK_MSG(
     status == Error::Ok,
-    "GetProgramData() failed with status 0x%" PRIx32,
+    "get_program_data() failed with status 0x%" PRIx32,
     status);
 ```
 
 ### Load Bundled Input to Method
-To execute the program on the bundled input, we need to load the bundled input into the method. Here we provided an API called `torch::executor::bundled_program::LoadBundledInput`:
+To execute the program on the bundled input, we need to load the bundled input into the method. Here we provided an API called `executorch::bundled_program::load_bundled_input`:
 
-:::{dropdown} `LoadBundledInput`
+:::{dropdown} `load_bundled_input`
 
 ```{eval-rst}
-.. doxygenfunction:: torch::executor::bundled_program::LoadBundledInput
+.. doxygenfunction:: ::executorch::bundled_program::load_bundled_input
 ```
 :::
 
 ### Verify the Method's Output.
-We call `torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput` to verify the method's output with bundled expected outputs. Here's the details of this API:
+We call `executorch::bundled_program::verify_method_outputs` to verify the method's output with bundled expected outputs. Here's the details of this API:
 
-:::{dropdown} `VerifyResultWithBundledExpectedOutput`
+:::{dropdown} `verify_method_outputs`
 
 ```{eval-rst}
-.. doxygenfunction:: torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput
+.. doxygenfunction:: ::executorch::bundled_program::verify_method_outputs
 ```
 :::
 
@@ -266,13 +266,13 @@ ET_CHECK_MSG(
     method.error());
 
 // Load testset_idx-th input in the buffer to plan
-status = torch::executor::bundled_program::LoadBundledInput(
+status = executorch::bundled_program::load_bundled_input(
     *method,
     program_data.bundled_program_data(),
     FLAGS_testset_idx);
 ET_CHECK_MSG(
     status == Error::Ok,
-    "LoadBundledInput failed with status 0x%" PRIx32,
+    "load_bundled_input failed with status 0x%" PRIx32,
     status);
 
 // Execute the plan
@@ -283,7 +283,7 @@ ET_CHECK_MSG(
     status);
 
 // Verify the result.
-status = torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput(
+status = executorch::bundled_program::verify_method_outputs(
     *method,
     program_data.bundled_program_data(),
     FLAGS_testset_idx,
diff --git a/docs/source/concepts.md b/docs/source/concepts.md
index c085505b61..289ecda6d8 100644
--- a/docs/source/concepts.md
+++ b/docs/source/concepts.md
@@ -26,7 +26,7 @@ The goal of ATen dialect is to capture users’ programs as faithfully as possib
 
 ## ATen mode
 
-ATen mode uses the ATen implementation of Tensor (`at::Tensor`) and related types, such as `ScalarType`, from the PyTorch core. This is in contrast to portable mode, which uses ExecuTorch’s smaller implementation of tensor (`torch::executor::Tensor`) and related types, such as `torch::executor::ScalarType`.
+ATen mode uses the ATen implementation of Tensor (`at::Tensor`) and related types, such as `ScalarType`, from the PyTorch core. This is in contrast to ETensor mode, which uses ExecuTorch’s smaller implementation of tensor (`executorch::runtime::etensor::Tensor`) and related types, such as `executorch::runtime::etensor::ScalarType`.
 - ATen kernels that rely on the full `at::Tensor` API are usable in this configuration.
 - ATen kernels tend to do dynamic memory allocation and often have extra flexibility (and thus overhead) to handle cases not needed by mobile/embedded clients. e.g., CUDA support, sparse tensor support, and dtype promotion.
 - Note: ATen mode is currently a WIP.
@@ -244,10 +244,10 @@ Kernels that support a subset of tensor dtypes and/or dim orders.
 
 Parts of a model may be delegated to run on an optimized backend. The partitioner splits the graph into the appropriate sub-networks and tags them for delegation.
 
-## Portable mode (lean mode)
+## ETensor mode
 
-Portable mode uses ExecuTorch’s smaller implementation of tensor (`torch::executor::Tensor`) along with related types (`torch::executor::ScalarType`, etc.). This is in contrast to ATen mode, which uses the ATen implementation of Tensor (`at::Tensor`) and related types (`ScalarType`, etc.)
-- `torch::executor::Tensor`, also known as ETensor, is a source-compatible subset of `at::Tensor`. Code written against ETensor can build against `at::Tensor`.
+ETensor mode uses ExecuTorch’s smaller implementation of tensor (`executorch::runtime::etensor::Tensor`) along with related types (`executorch::runtime::etensor::ScalarType`, etc.). This is in contrast to ATen mode, which uses the ATen implementation of Tensor (`at::Tensor`) and related types (`ScalarType`, etc.)
+- `executorch::runtime::etensor::Tensor`, also known as ETensor, is a source-compatible subset of `at::Tensor`. Code written against ETensor can build against `at::Tensor`.
 - ETensor does not own or allocate memory on its own. To support dynamic shapes, kernels can allocate Tensor data using the MemoryAllocator provided by the client.
 
 ## Portable kernels
diff --git a/docs/source/etdump.md b/docs/source/etdump.md
index 42391cf40e..13957f1c01 100644
--- a/docs/source/etdump.md
+++ b/docs/source/etdump.md
@@ -15,7 +15,7 @@ Generating an ETDump is a relatively straightforward process. Users can follow t
 2. ***Create*** an Instance of the ETDumpGen class and pass it into the `load_method` call that is invoked in the runtime.
 
 ```C++
-torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen();
+executorch::etdump::ETDumpGen etdump_gen;
 Result method = program->load_method(method_name, &memory_manager, &etdump_gen);
 ```
 
diff --git a/docs/source/executorch-runtime-api-reference.rst b/docs/source/executorch-runtime-api-reference.rst
index ef9a99e593..5bec597987 100644
--- a/docs/source/executorch-runtime-api-reference.rst
+++ b/docs/source/executorch-runtime-api-reference.rst
@@ -11,25 +11,25 @@ For detailed information on how APIs evolve and the deprecation process, please
 Model Loading and Execution
 ---------------------------
 
-.. doxygenclass:: executorch::runtime::DataLoader
+.. doxygenclass:: executorch::runtime::Program
   :members:
 
-.. doxygenclass:: executorch::runtime::MemoryAllocator
+.. doxygenclass:: executorch::runtime::Method
   :members:
 
-.. doxygenclass:: executorch::runtime::HierarchicalAllocator
+.. doxygenclass:: executorch::runtime::MethodMeta
   :members:
 
-.. doxygenclass:: executorch::runtime::MemoryManager
+.. doxygenclass:: executorch::runtime::DataLoader
   :members:
 
-.. doxygenclass:: executorch::runtime::Program
+.. doxygenclass:: executorch::runtime::MemoryAllocator
   :members:
 
-.. doxygenclass:: executorch::runtime::Method
+.. doxygenclass:: executorch::runtime::HierarchicalAllocator
   :members:
 
-.. doxygenclass:: executorch::runtime::MethodMeta
+.. doxygenclass:: executorch::runtime::MemoryManager
   :members:
 
 Values
@@ -38,5 +38,5 @@ Values
 .. doxygenstruct:: executorch::runtime::EValue
   :members:
 
-.. doxygenclass:: executorch::aten::Tensor
+.. doxygenclass:: executorch::runtime::etensor::Tensor
   :members:
diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md
index 71b13598ed..2cdf13ca65 100644
--- a/docs/source/llm/getting-started.md
+++ b/docs/source/llm/getting-started.md
@@ -43,14 +43,13 @@ cd et-nanogpt
 
 # Clone the ExecuTorch repository and submodules.
 mkdir third-party
-git clone -b release/0.2 https://github.com/pytorch/executorch.git third-party/executorch
+git clone -b release/0.4 https://github.com/pytorch/executorch.git third-party/executorch
 cd third-party/executorch
 git submodule update --init
 
 # Create a conda environment and install requirements.
 conda create -yn executorch python=3.10.0
 conda activate executorch
-pip install cmake zstd
 ./install_requirements.sh
 
 cd ../..
@@ -77,12 +76,11 @@ pyenv activate executorch
 
 # Clone the ExecuTorch repository and submodules.
 mkdir third-party
-git clone -b release/0.2 https://github.com/pytorch/executorch.git third-party/executorch
+git clone -b release/0.4 https://github.com/pytorch/executorch.git third-party/executorch
 cd third-party/executorch
 git submodule update --init
 
 # Install requirements.
-pip install cmake zstd
 PYTHON_EXECUTABLE=python ./install_requirements.sh
 
 cd ../..
@@ -208,8 +206,8 @@ Create a file called main.cpp with the following contents:
 #include 
 #include 
 
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
 using executorch::extension::from_blob;
 using executorch::extension::Module;
 using executorch::runtime::EValue;
@@ -235,56 +233,56 @@ std::string generate(
     BasicSampler& sampler,
     size_t max_input_length,
     size_t max_output_length) {
+  // Convert the input text into a list of integers (tokens) that represents it,
+  // using the string-to-token mapping that the model was trained on. Each token
+  // is an integer that represents a word or part of a word.
+  std::vector input_tokens = tokenizer.encode(prompt);
+  std::vector output_tokens;
+
+  for (auto i = 0u; i < max_output_length; i++) {
+    // Convert the input_tokens from a vector of int64_t to EValue. EValue is a
+    // unified data type in the ExecuTorch runtime.
+    auto inputs = from_blob(
+        input_tokens.data(),
+        {1, static_cast(input_tokens.size())},
+        ScalarType::Long);
+
+    // Run the model. It will return a tensor of logits (log-probabilities).
+    auto logits_evalue = llm_model.forward(inputs);
+
+    // Convert the output logits from EValue to std::vector, which is what the
+    // sampler expects.
+    Tensor logits_tensor = logits_evalue.get()[0].toTensor();
+    std::vector logits(
+        logits_tensor.data_ptr(),
+        logits_tensor.data_ptr() + logits_tensor.numel());
+
+    // Sample the next token from the logits.
+    int64_t next_token = sampler.sample(logits);
+
+    // Break if we reached the end of the text.
+    if (next_token == ENDOFTEXT_TOKEN) {
+      break;
+    }
+
+    // Add the next token to the output.
+    output_tokens.push_back(next_token);
+
+    std::cout << tokenizer.decode({next_token});
+    std::cout.flush();
-    // Convert the input text into a list of integers (tokens) that represents
-    // it, using the string-to-token mapping that the model was trained on.
-    // Each token is an integer that represents a word or part of a word.
-    std::vector input_tokens = tokenizer.encode(prompt);
-    std::vector output_tokens;
-
-    for (auto i = 0u; i < max_output_length; i++) {
-        // Convert the input_tokens from a vector of int64_t to EValue.
-        // EValue is a unified data type in the ExecuTorch runtime.
-        auto inputs = from_blob(
-            input_tokens.data(),
-            {1, static_cast(input_tokens.size())},
-            ScalarType::Long);
-
-        // Run the model. It will return a tensor of logits (log-probabilities).
-        auto logits_evalue = llm_model.forward(inputs);
-
-        // Convert the output logits from EValue to std::vector, which is what
-        // the sampler expects.
-        Tensor logits_tensor = logits_evalue.get()[0].toTensor();
-        std::vector logits(logits_tensor.data_ptr(),
-            logits_tensor.data_ptr() + logits_tensor.numel());
-
-        // Sample the next token from the logits.
-        int64_t next_token = sampler.sample(logits);
-
-        // Break if we reached the end of the text.
-        if (next_token == ENDOFTEXT_TOKEN) {
-            break;
-        }
-
-        // Add the next token to the output.
-        output_tokens.push_back(next_token);
-
-        std::cout << tokenizer.decode({ next_token });
-        std::cout.flush();
-
-        // Update next input.
-        input_tokens.push_back(next_token);
-        if (input_tokens.size() > max_input_length) {
-            input_tokens.erase(input_tokens.begin());
-        }
+    // Update next input.
+    input_tokens.push_back(next_token);
+    if (input_tokens.size() > max_input_length) {
+      input_tokens.erase(input_tokens.begin());
     }
+  }
 
-    std::cout << std::endl;
+  std::cout << std::endl;
 
-    // Convert the output tokens into a human-readable string.
-    std::string output_string = tokenizer.decode(output_tokens);
-    return output_string;
+  // Convert the output tokens into a human-readable string.
+  std::string output_string = tokenizer.decode(output_tokens);
+  return output_string;
 }
 ```
@@ -309,32 +307,32 @@ penalties for repeated tokens, and biases to prioritize or de-prioritize specifi
 
 ```cpp
 // main.cpp
-using namespace torch::executor;
-
 int main() {
-    // Set up the prompt. This provides the seed text for the model to elaborate.
-    std::cout << "Enter model prompt: ";
-    std::string prompt;
-    std::getline(std::cin, prompt);
-
-    // The tokenizer is used to convert between tokens (used by the model) and
-    // human-readable strings.
-    BasicTokenizer tokenizer("vocab.json");
-
-    // The sampler is used to sample the next token from the logits.
-    BasicSampler sampler = BasicSampler();
-
-    // Load the exported nanoGPT program, which was generated via the previous steps.
-    Module model("nanogpt.pte", Module::LoadMode::MmapUseMlockIgnoreErrors);
-
-    const auto max_input_tokens = 1024;
-    const auto max_output_tokens = 30;
-    std::cout << prompt;
-    generate(model, prompt, tokenizer, sampler, max_input_tokens, max_output_tokens);
+  // Set up the prompt. This provides the seed text for the model to elaborate.
+  std::cout << "Enter model prompt: ";
+  std::string prompt;
+  std::getline(std::cin, prompt);
+
+  // The tokenizer is used to convert between tokens (used by the model) and
+  // human-readable strings.
+  BasicTokenizer tokenizer("vocab.json");
+
+  // The sampler is used to sample the next token from the logits.
+  BasicSampler sampler = BasicSampler();
+
+  // Load the exported nanoGPT program, which was generated via the previous
+  // steps.
+  Module model("nanogpt.pte", Module::LoadMode::MmapUseMlockIgnoreErrors);
+
+  const auto max_input_tokens = 1024;
+  const auto max_output_tokens = 30;
+  std::cout << prompt;
+  generate(
+      model, prompt, tokenizer, sampler, max_input_tokens, max_output_tokens);
 }
 ```
 
-Finally, download the following files into the same directory as main.h:
+Finally, download the following files into the same directory as main.cpp:
 
 ```
 curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_sampler.h
@@ -368,17 +366,19 @@ option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
 
 # Include the executorch subdirectory.
 add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch
-    ${CMAKE_BINARY_DIR}/third-party/executorch)
+  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch
+  ${CMAKE_BINARY_DIR}/executorch
+)
 
 add_executable(nanogpt_runner main.cpp)
 target_link_libraries(
-    nanogpt_runner
-    PRIVATE
-    executorch
-    extension_module_static # Provides the Module class
-    extension_tensor # Provides the TensorPtr class
-    optimized_native_cpu_ops_lib) # Provides baseline cross-platform kernels
+  nanogpt_runner
+  PRIVATE executorch
+          extension_module_static # Provides the Module class
+          extension_tensor # Provides the TensorPtr class
+          optimized_native_cpu_ops_lib # Provides baseline cross-platform
+                                       # kernels
+)
 ```
 
 At this point, the working directory should contain the following files:
@@ -524,20 +524,20 @@ option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend
 
 # Include the executorch subdirectory.
 add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch
-    ${CMAKE_BINARY_DIR}/executorch)
-
-# include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
+  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch
+  ${CMAKE_BINARY_DIR}/executorch
+)
 
 add_executable(nanogpt_runner main.cpp)
 target_link_libraries(
-    nanogpt_runner
-    PRIVATE
-    executorch
-    extension_module_static # Provides the Module class
-    extension_tensor # Provides the TensorPtr class
-    optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels
-    xnnpack_backend) # Provides the XNNPACK CPU acceleration backend
+  nanogpt_runner
+  PRIVATE executorch
+          extension_module_static # Provides the Module class
+          extension_tensor # Provides the TensorPtr class
+          optimized_native_cpu_ops_lib # Provides baseline cross-platform
+                                       # kernels
+          xnnpack_backend # Provides the XNNPACK CPU acceleration backend
+)
 ```
 
 Keep the rest of the code the same. For more details refer to [Exporting
diff --git a/docs/source/running-a-model-cpp-tutorial.md b/docs/source/running-a-model-cpp-tutorial.md
index edd5d95086..70e17c8222 100644
--- a/docs/source/running-a-model-cpp-tutorial.md
+++ b/docs/source/running-a-model-cpp-tutorial.md
@@ -24,14 +24,25 @@ Users can define their own `DataLoader`s to fit the needs of their particular sy
 For the `FileDataLoader` all we need to do is provide a file path to the constructor.
 
 ``` cpp
-using namespace torch::executor;
-
-Result loader =
-    util::FileDataLoader::from("/tmp/model.pte");
+using executorch::aten::Tensor;
+using executorch::aten::TensorImpl;
+using executorch::extension::FileDataLoader;
+using executorch::extension::MallocMemoryAllocator;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::HierarchicalAllocator;
+using executorch::runtime::MemoryManager;
+using executorch::runtime::Method;
+using executorch::runtime::MethodMeta;
+using executorch::runtime::Program;
+using executorch::runtime::Result;
+using executorch::runtime::Span;
+
+Result loader =
+    FileDataLoader::from("/tmp/model.pte");
 assert(loader.ok());
 
-Result program =
-    torch::executor::Program::load(&loader.get());
+Result program = Program::load(&loader.get());
 assert(program.ok());
 ```
 
@@ -48,14 +59,13 @@ One of the principles of ExecuTorch is giving users control over where the memor
 For this example we will retrieve the size of the planned memory arenas dynamically from the `Program`, but for heapless environments users could retrieve this information from the `Program` ahead of time and allocate the arena statically. We will also be using a malloc based allocator for the method allocator.
 
 ``` cpp
-
-// Method names map back to Python nn.Module method names. Most users will only have the singular method "forward".
+// Method names map back to Python nn.Module method names. Most users will only
+// have the singular method "forward".
 const char* method_name = "forward";
 
 // MethodMeta is a lightweight structure that lets us gather metadata
-// information about a specific method. In this case we are looking to
-// get the required size of the memory planned buffers for the method
-// "forward".
+// information about a specific method. In this case we are looking to get the
+// required size of the memory planned buffers for the method "forward".
 Result method_meta = program->method_meta(method_name);
 assert(method_meta.ok());
 
@@ -64,7 +74,8 @@ std::vector> planned_arenas; // Passed to the allocator
 
 size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
 
-// It is possible to have multiple layers in our memory hierarchy; for example, SRAM and DRAM.
+// It is possible to have multiple layers in our memory hierarchy; for example,
+// SRAM and DRAM.
 for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
   // .get() will always succeed because id < num_memory_planned_buffers.
   size_t buffer_size =
@@ -75,12 +86,12 @@ for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
 HierarchicalAllocator planned_memory(
     {planned_arenas.data(), planned_arenas.size()});
 
-// Version of MemoryAllocator that uses malloc to handle allocations
-// rather then a fixed buffer.
-util::MallocMemoryAllocator method_allocator;
+// Version of MemoryAllocator that uses malloc to handle allocations rather then
+// a fixed buffer.
+MallocMemoryAllocator method_allocator;
 
-// Assemble all of the allocators into the MemoryManager that the Executor
-// will use.
+// Assemble all of the allocators into the MemoryManager that the Executor will
+// use.
 MemoryManager memory_manager(&method_allocator, &planned_memory);
 ```
 
diff --git a/examples/llm_manual/CMakeLists.txt b/examples/llm_manual/CMakeLists.txt
index e5054a683a..1283eb548e 100644
--- a/examples/llm_manual/CMakeLists.txt
+++ b/examples/llm_manual/CMakeLists.txt
@@ -23,8 +23,6 @@ add_subdirectory(
   ${CMAKE_BINARY_DIR}/executorch
 )
 
-# include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
-
 add_executable(nanogpt_runner main.cpp)
 target_link_libraries(
   nanogpt_runner
@@ -33,5 +31,5 @@ target_link_libraries(
   extension_tensor # Provides the TensorPtr class
   optimized_native_cpu_ops_lib # Provides baseline cross-platform
                                # kernels
-  xnnpack_backend
-) # Provides the XNNPACK CPU acceleration backend
+  xnnpack_backend # Provides the XNNPACK CPU acceleration backend
+)
diff --git a/examples/llm_manual/main.cpp b/examples/llm_manual/main.cpp
index 5bd318983b..76492513f9 100644
--- a/examples/llm_manual/main.cpp
+++ b/examples/llm_manual/main.cpp
@@ -17,15 +17,15 @@
 #include 
 #include 
 
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
 using executorch::extension::from_blob;
 using executorch::extension::Module;
 using executorch::runtime::EValue;
 using executorch::runtime::Result;
 
 // The value of the gpt2 `<|endoftext|>` token.
-#define ENDOFTEXT 50256
+#define ENDOFTEXT_TOKEN 50256
 
 std::string generate(
     Module& llm_model,
@@ -34,15 +34,15 @@ std::string generate(
     BasicSampler& sampler,
     size_t max_input_length,
     size_t max_output_length) {
-  // Convert the input text into a list of integers (tokens) that represents
-  // it, using the string-to-token mapping that the model was trained on.
-  // Each token is an integer that represents a word or part of a word.
+  // Convert the input text into a list of integers (tokens) that represents it,
+  // using the string-to-token mapping that the model was trained on. Each token
+  // is an integer that represents a word or part of a word.
   std::vector input_tokens = tokenizer.encode(prompt);
   std::vector output_tokens;
 
   for (auto i = 0u; i < max_output_length; i++) {
-    // Convert the input_tokens from a vector of int64_t to EValue.
-    // EValue is a unified data type in the ExecuTorch runtime.
+    // Convert the input_tokens from a vector of int64_t to EValue. EValue is a
+    // unified data type in the ExecuTorch runtime.
     auto inputs = from_blob(
         input_tokens.data(),
         {1, static_cast(input_tokens.size())},
@@ -51,8 +51,8 @@ std::string generate(
     // Run the model. It will return a tensor of logits (log-probabilities).
     auto logits_evalue = llm_model.forward(inputs);
 
-    // Convert the output logits from EValue to std::vector, which is what
-    // the sampler expects.
+    // Convert the output logits from EValue to std::vector, which is what the
+    // sampler expects.
     Tensor logits_tensor = logits_evalue.get()[0].toTensor();
     std::vector logits(
         logits_tensor.data_ptr(),
@@ -62,7 +62,7 @@ std::string generate(
     int64_t next_token = sampler.sample(logits);
 
     // Break if we reached the end of the text.
-    if (next_token == ENDOFTEXT) {
+    if (next_token == ENDOFTEXT_TOKEN) {
       break;
     }
 
@@ -86,11 +86,9 @@ std::string generate(
   return output_string;
 }
 
-// main.cpp
-
 int main() {
   // Set up the prompt. This provides the seed text for the model to elaborate.
-  std::cout << "Prompt: ";
+  std::cout << "Enter model prompt: ";
   std::string prompt;
   std::getline(std::cin, prompt);