From 12dd715807ccbd7fafbb64d42571792db1cc6497 Mon Sep 17 00:00:00 2001
From: William Lin
Date: Fri, 6 Sep 2024 17:48:48 -0700
Subject: [PATCH] [misc] [doc] [frontend] LLM torch profiler support (#7943)

---
 docs/source/dev/profiling/profiling_index.rst | 20 +++++++++--
 examples/offline_inference_with_profiler.py   | 33 +++++++++++++++++++
 vllm/engine/llm_engine.py                     |  6 ++++
 vllm/entrypoints/llm.py                       |  6 ++++
 vllm/executor/cpu_executor.py                 |  6 ++++
 vllm/executor/gpu_executor.py                 |  6 ++++
 6 files changed, 74 insertions(+), 3 deletions(-)
 create mode 100644 examples/offline_inference_with_profiler.py

diff --git a/docs/source/dev/profiling/profiling_index.rst b/docs/source/dev/profiling/profiling_index.rst
index af3c78c3b5a5..e22d54729344 100644
--- a/docs/source/dev/profiling/profiling_index.rst
+++ b/docs/source/dev/profiling/profiling_index.rst
@@ -17,14 +17,28 @@ Traces can be visualized using https://ui.perfetto.dev/.
 
 .. tip::
 
     Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
-
-Example commands:
+
+.. tip::
+
+    When the profiler is stopped, it flushes all of its trace files out to the directory, which takes time: for example, about 100 requests worth of data for a Llama 70B model takes roughly 10 minutes to flush out on an H100.
+    Set the environment variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a large value, e.g. 30 minutes, before you start the server:
+    ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
+
+Example commands and usage:
+===========================
+
+Offline Inference:
+------------------
+
+Refer to `examples/offline_inference_with_profiler.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py>`_ for an example.
+
 
 OpenAI Server:
+--------------
 
 .. code-block:: bash
 
-    VLLM_TORCH_PROFILER_DIR=/mnt/traces/ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
+    VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
 
 benchmark_serving.py:
diff --git a/examples/offline_inference_with_profiler.py b/examples/offline_inference_with_profiler.py
new file mode 100644
index 000000000000..906c9502800d
--- /dev/null
+++ b/examples/offline_inference_with_profiler.py
@@ -0,0 +1,33 @@
+import os
+
+from vllm import LLM, SamplingParams
+
+# Enable the torch profiler; this can also be set on the command line.
+os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(model="facebook/opt-125m")
+
+llm.start_profile()
+
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+
+llm.stop_profile()
+
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 50dcb6937eb6..78ddcd1daaf6 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1914,6 +1914,12 @@ def check_health(self) -> None:
         self.tokenizer.check_health()
         self.model_executor.check_health()
 
+    def start_profile(self) -> None:
+        self.model_executor.start_profile()
+
+    def stop_profile(self) -> None:
+        self.model_executor.stop_profile()
+
     def is_tracing_enabled(self) -> bool:
         return self.tracer is not None
 
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index f587ec300314..1e4432eaaa66 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -560,6 +560,12 @@ def encode(
         outputs = self._run_engine(use_tqdm=use_tqdm)
         return LLMEngine.validate_outputs(outputs, EmbeddingRequestOutput)
 
+    def start_profile(self) -> None:
+        self.llm_engine.start_profile()
+
+    def stop_profile(self) -> None:
+        self.llm_engine.stop_profile()
+
     # LEGACY
     def _convert_v1_inputs(
         self,
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 21ad43f64168..ec9b24ce1318 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -296,6 +296,12 @@ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
         for result in parallel_worker_tasks:
             result.get()
 
+    def start_profile(self) -> None:
+        self.driver_method_invoker(self.driver_worker, "start_profile")
+
+    def stop_profile(self) -> None:
+        self.driver_method_invoker(self.driver_worker, "stop_profile")
+
 
 class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
 
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 947776e5d6ef..2185c9cf6cea 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -169,6 +169,12 @@ def check_health(self) -> None:
         # GPUExecutor will always be healthy as long as
         # it's running.
         return
 
+    def start_profile(self) -> None:
+        self.driver_worker.start_profile()
+
+    def stop_profile(self) -> None:
+        self.driver_worker.stop_profile()
+
 
 class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase):
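
Note: the executor-level start_profile()/stop_profile() methods in this patch forward to the driver worker's profiler hooks, which are not shown here. For orientation only, below is a minimal sketch of how such worker-side hooks could be wired to torch.profiler when VLLM_TORCH_PROFILER_DIR is set; the WorkerProfilerMixin class, init_profiler method, and their wiring are hypothetical illustrations, not vLLM's actual worker implementation.

# Illustrative sketch only: worker-side start_profile/stop_profile hooks driven by
# torch.profiler. Class name and wiring are hypothetical, not vLLM's worker code.
import os

import torch


class WorkerProfilerMixin:

    def init_profiler(self) -> None:
        # Mirror the env var used in this patch; profiling is disabled if it is unset.
        trace_dir = os.getenv("VLLM_TORCH_PROFILER_DIR")
        self.profiler = None
        if trace_dir:
            self.profiler = torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ],
                with_stack=True,
                # Writes one trace file per worker into trace_dir when stopped.
                on_trace_ready=torch.profiler.tensorboard_trace_handler(trace_dir),
            )

    def start_profile(self) -> None:
        if self.profiler is None:
            raise RuntimeError("Set VLLM_TORCH_PROFILER_DIR to enable profiling.")
        self.profiler.start()

    def stop_profile(self) -> None:
        if self.profiler is None:
            raise RuntimeError("Set VLLM_TORCH_PROFILER_DIR to enable profiling.")
        # stop() triggers on_trace_ready, which flushes the trace to disk; this flush
        # is the slow step that the VLLM_RPC_GET_DATA_TIMEOUT_MS tip above addresses.
        self.profiler.stop()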