This repository was archived by the owner on Dec 1, 2024. It is now read-only.
README.md
+43 −43
@@ -1,6 +1,6 @@
-# FlexGen: High-throughput Generative Inference of Large Language Models with a Single GPU [[paper](https://arxiv.org/abs/2303.06865)]
+# FlexLLMGen: High-throughput Generative Inference of Large Language Models with a Single GPU [[paper](https://arxiv.org/abs/2303.06865)]

-FlexGen is a high-throughput generation engine for running large language models with limited GPU memory. FlexGen allows **high-throughput** generation by IO-efficient offloading, compression, and **large effective batch sizes**.
+FlexLLMGen is a high-throughput generation engine for running large language models with limited GPU memory. FlexLLMGen allows **high-throughput** generation by IO-efficient offloading, compression, and **large effective batch sizes**.

## Motivation
@@ -18,15 +18,15 @@ Throughput is a measure of tokens processed per second over the job's entire run
Throughput-oriented workloads provide opportunities to trade off latency for higher throughput, which
makes it easier to take advantage of low-cost commodity GPUs.

-The goal of FlexGen is to create a high-throughput system to enable new and exciting applications of
+The goal of FlexLLMGen is to create a high-throughput system to enable new and exciting applications of
foundation models to throughput-oriented tasks on low-cost hardware, such as a single commodity GPU
instead of expensive systems.

-Check out the [examples](#examples) of what you can run on a single commodity GPU with FlexGen, including benchmarking and data wrangling.
+Check out the [examples](#examples) of what you can run on a single commodity GPU with FlexLLMGen, including benchmarking and data wrangling.

-❌ **Limitation**. As an offloading-based system running on weak GPUs, FlexGen also has its limitations.
-FlexGen can be significantly slower than the case when you have enough powerful GPUs to hold the whole model, especially for small-batch cases.
-FlexGen is mostly optimized for throughput-oriented batch processing settings (e.g., classifying or extracting information from many documents in batches), on single GPUs.
+❌ **Limitation**. As an offloading-based system running on weak GPUs, FlexLLMGen also has its limitations.
+FlexLLMGen can be significantly slower than the case when you have enough powerful GPUs to hold the whole model, especially for small-batch cases.
+FlexLLMGen is mostly optimized for throughput-oriented batch processing settings (e.g., classifying or extracting information from many documents in batches), on single GPUs.

----------
@@ -45,8 +45,8 @@ This project was made possible thanks to a collaboration with
- [Installation](#installation)
- [Usage and Examples](#usage-and-examples)
  - [Get Started with a Single GPU](#get-started-with-a-single-gpu)
-  - [Run HELM Benchmark with FlexGen](#run-helm-benchmark-with-flexgen)
-  - [Run Data Wrangling Tasks with FlexGen](#run-data-wrangling-tasks-with-flexgen)
+  - [Run HELM Benchmark with FlexLLMGen](#run-helm-benchmark-with-flexllmgen)
+  - [Run Data Wrangling Tasks with FlexLLMGen](#run-data-wrangling-tasks-with-flexllmgen)
  - [Scaling to Distributed GPUs](#scaling-to-distributed-gpus)
You should see some text generated by OPT-1.3B and the benchmark results.

#### OPT-30B

To run large models like OPT-30B, you will need to use CPU offloading. You can try the commands below.
The `--percent` argument specifies the offloading strategy for parameters, attention cache, and hidden states separately.
-The exact meaning of this argument can be found [here](https://github.com/FMInference/FlexGen/blob/9d092d848f106cd9eaf305c12ef3590f7bcb0277/flexgen/flex_opt.py#L1271-L1279).
+The exact meaning of this argument can be found [here](https://github.com/FMInference/FlexLLMGen/blob/9d092d848f106cd9eaf305c12ef3590f7bcb0277/flexllmgen/flex_opt.py#L1271-L1279).
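For orientation, a command of this shape can be used. This is a minimal sketch only: the `flexllmgen.flex_opt` module path and the specific percentage split are assumptions about the renamed package, not values taken from this diff.

```
# Illustrative sketch only. --percent takes six integers:
# weights GPU%, weights CPU%, cache GPU%, cache CPU%, activations GPU%, activations CPU%;
# whatever remains in each pair is placed on disk.
python3 -m flexllmgen.flex_opt --model facebook/opt-30b --percent 0 100 100 0 100 0
```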
To run OPT-175B, you need to download the weights from [metaseq](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT) and convert the weights into Alpa [format](https://alpa.ai/tutorials/opt_serving.html#convert-opt-175b-weights-into-alpa-formats).
You can then try offloading all weights to disk by
-### Run HELM Benchmark with FlexGen
-FlexGen can be integrated into [HELM](https://crfm.stanford.edu/helm), a language model benchmark framework, as its execution backend.
+### Run HELM Benchmark with FlexLLMGen
+FlexLLMGen can be integrated into [HELM](https://crfm.stanford.edu/helm), a language model benchmark framework, as its execution backend.

You can use the commands below to run a Massive Multitask Language Understanding (MMLU) [scenario](https://crfm.stanford.edu/helm/latest/?group=mmlu) with a single T4 (16GB) GPU and 200GB of DRAM.
-Note that only a subset of HELM scenarios is tested. See more tested scenarios [here](flexgen/apps/helm_passed_30b.sh).
+Note that only a subset of HELM scenarios is tested. See more tested scenarios [here](flexllmgen/apps/helm_passed_30b.sh).
-### Run Data Wrangling Tasks with FlexGen
-You can run the examples in this paper, ['Can Foundation Models Wrangle Your Data?'](https://arxiv.org/abs/2205.09911), by following the instructions [here](flexgen/apps/data_wrangle).
+### Run Data Wrangling Tasks with FlexLLMGen
+You can run the examples in this paper, ['Can Foundation Models Wrangle Your Data?'](https://arxiv.org/abs/2205.09911), by following the instructions [here](flexllmgen/apps/data_wrangle).
### Scaling to Distributed GPUs

-If you have multiple machines with GPUs, FlexGen can combine offloading with pipeline parallelism to allow scaling.
-For example, if you have 2 GPUs but the aggregated GPU memory is less than the model size, you still need offloading. FlexGen allows you to do pipeline parallelism with these 2 GPUs to accelerate the generation.
+If you have multiple machines with GPUs, FlexLLMGen can combine offloading with pipeline parallelism to allow scaling.
+For example, if you have 2 GPUs but the aggregated GPU memory is less than the model size, you still need offloading. FlexLLMGen allows you to do pipeline parallelism with these 2 GPUs to accelerate the generation.

But to have scaled performance, you should have GPUs on distributed machines.
-See examples [here](https://github.com/FMInference/FlexGen/tree/main/benchmark/flexgen#distributed-gpus).
+See examples [here](https://github.com/FMInference/FlexLLMGen/tree/main/benchmark/flexllmgen#distributed-gpus).
### API Example

-We demonstrate the usage of the FlexGen API in [completion.py](flexgen/apps/completion.py).
+We demonstrate the usage of the FlexLLMGen API in [completion.py](flexllmgen/apps/completion.py).
This example shows how to run generation for two sentences.
-To get the best throughput out of FlexGen, you typically need to batch more sentences.
+To get the best throughput out of FlexLLMGen, you typically need to batch more sentences.

#### Generation API

-FlexGen has a generation API following the style of Hugging Face's transformers.
+FlexLLMGen has a generation API following the style of Hugging Face's transformers.
```python
output_ids = model.generate(
    input_ids,
@@ -138,25 +138,25 @@ If you do not have enough GPU/CPU memory, see the [Handle Out-Of-Memory](#handle
```
# Complete with OPT-6.7B. You need at least 15GB of GPU memory.
#### How to set the offloading strategy and `--percent`?

We will release an automatic policy optimizer later, but for now you have to manually try a few strategies.
The idea of high-throughput generation is to offload parameters and attention cache as much as possible to the CPU and disk if necessary.
-You can see the reference strategies in our benchmark [here](https://github.com/FMInference/FlexGen/blob/9d092d848f106cd9eaf305c12ef3590f7bcb0277/benchmark/flexgen/bench_suite.py#L39-L79).
+You can see the reference strategies in our benchmark [here](https://github.com/FMInference/FlexLLMGen/blob/9d092d848f106cd9eaf305c12ef3590f7bcb0277/benchmark/flexllmgen/bench_suite.py#L39-L79).
To avoid out-of-memory errors, you can tune `--percent` to offload more tensors to the CPU and disk.
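As a rough illustration of that tuning direction, the sketch below shows how the split might be shifted; the percentage values are assumptions for a 16GB GPU, not reference settings from the benchmark suite.

```
# Illustrative sketch only: progressively move tensors off the GPU if you hit OOM.
# Keep 20% of the weights on the GPU, the rest on the CPU; cache and activations on the CPU.
python3 -m flexllmgen.flex_opt --model facebook/opt-30b --percent 20 80 0 100 0 100
# If that still runs out of memory, keep all weights on the CPU and spill the cache to disk.
python3 -m flexllmgen.flex_opt --model facebook/opt-30b --percent 0 100 0 0 0 100
```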
@@ -176,31 +176,31 @@ The corresponding effective batch sizes and lowest offloading devices are in par
| Hugging Face Accelerate | 25.12 (2 on GPU) | 0.62 (8 on CPU) | 0.01 (2 on disk) |
| DeepSpeed ZeRO-Inference | 9.28 (16 on CPU) | 0.60 (4 on CPU) | 0.01 (1 on disk) |
| Petals | 8.25 (2 on GPU) | 2.84 (2 on GPU) | 0.08 (2 on GPU) |
-| FlexGen | 25.26 (2 on GPU) | 7.32 (144 on CPU) | 0.69 (256 on disk) |
-| FlexGen with Compression | **29.12** (72 on GPU) | **8.38** (512 on CPU) | **1.12** (144 on CPU) |
+| FlexLLMGen | 25.26 (2 on GPU) | 7.32 (144 on CPU) | 0.69 (256 on disk) |
+| FlexLLMGen with Compression | **29.12** (72 on GPU) | **8.38** (512 on CPU) | **1.12** (144 on CPU) |

- Hardware: an NVIDIA T4 (16GB) instance on GCP with 208GB of DRAM and 1.5TB of SSD.
- Workload: input sequence length = 512, output sequence length = 32. The batch size is tuned to **a large value** that maximizes the generation throughput for each system.
- Metric: generation throughput (token/s) = number of generated tokens / (time for processing prompts + time for generation).

-How to [reproduce](benchmark/flexgen).
+How to [reproduce](benchmark/flexllmgen).
### Latency-Throughput Trade-Off

The figure below shows the latency and throughput trade-off of three offloading-based systems on OPT-175B (left) and OPT-30B (right).
-FlexGen achieves a new Pareto-optimal frontier with significantly higher maximum throughput for both models.
+FlexLLMGen achieves a new Pareto-optimal frontier with significantly higher maximum throughput for both models.
Other systems cannot further increase throughput due to out-of-memory errors.

-FlexGen can be flexibly configured under various hardware resource constraints by aggregating memory and computation from the GPU, CPU, and disk. Through a linear programming optimizer, it searches for the best pattern to store and access the tensors, including weights, activations, and attention key/value (KV) cache. FlexGen further compresses both weights and KV cache to 4 bits with negligible accuracy loss.
+FlexLLMGen can be flexibly configured under various hardware resource constraints by aggregating memory and computation from the GPU, CPU, and disk. Through a linear programming optimizer, it searches for the best pattern to store and access the tensors, including weights, activations, and attention key/value (KV) cache. FlexLLMGen further compresses both weights and KV cache to 4 bits with negligible accuracy loss.

-One key idea of FlexGen is to play the latency-throughput trade-off. Achieving low latency is inherently challenging for offloading methods,
+One key idea of FlexLLMGen is to play the latency-throughput trade-off. Achieving low latency is inherently challenging for offloading methods,
but the I/O efficiency of offloading can be greatly boosted for throughput-oriented scenarios (see the figure above).
-FlexGen utilizes a block schedule to reuse weights and overlap I/O with computation, as shown in figure (b) below, while other baseline systems use an inefficient row-by-row schedule, as shown in figure (a) below.
+FlexLLMGen utilizes a block schedule to reuse weights and overlap I/O with computation, as shown in figure (b) below, while other baseline systems use an inefficient row-by-row schedule, as shown in figure (a) below.
benchmark/flexllmgen/README.md
+2 −2
@@ -1,9 +1,9 @@
-# Benchmark FlexGen
+# Benchmark FlexLLMGen

NOTE: This benchmark uses dummy weights by default for faster experiments.
It is expected that you will see randomly generated garbled characters; the throughput and latency numbers should still be correct.

## Mount SSD
-The following commands use `~/flexgen_offload_dir` as the offloading folder by default.
+The following commands use `~/flexllmgen_offload_dir` as the offloading folder by default.
To get the best performance, it is recommended to mount this folder on a fast SSD.
If you use AWS or GCP instances with local SSDs, you can use [mount_nvme_aws.sh](../../scripts/mount_nvme_aws.sh) or [mount_nvme_gcp.sh](../../scripts/mount_nvme_gcp.sh) to mount the local SSDs.
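To make the folder choice concrete, here is a hedged sketch of pointing a run at the mounted SSD. The `--offload-dir` flag, the module path, and the percentage split are assumptions about the renamed package, not text from this diff; adjust them to your setup.

```
# Illustrative sketch only: after mounting the SSD, direct disk offloading to that folder.
# Half of the weights stay in CPU memory, the remainder is written to the offload folder;
# cache and activations stay on the CPU.
python3 -m flexllmgen.flex_opt --model facebook/opt-30b \
  --percent 0 50 0 100 0 100 \
  --offload-dir ~/flexllmgen_offload_dir
```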