diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh index b2e910e1ba8a7..a67fc89d54e60 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do done lm_eval --model hf \ - --model_args pretrained=$MODEL,parallelize=True \ - --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ - --batch_size $BATCH_SIZE + --model_args "pretrained=$MODEL,parallelize=True" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index 4d32b49a4fac3..65be3c5d93b20 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do done lm_eval --model vllm \ - --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \ - --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ - --batch_size $BATCH_SIZE + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" diff --git a/.buildkite/lm-eval-harness/run-tests.sh b/.buildkite/lm-eval-harness/run-tests.sh index b4fdde6dab425..26f33b744289a 100644 --- a/.buildkite/lm-eval-harness/run-tests.sh +++ b/.buildkite/lm-eval-harness/run-tests.sh @@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do done # Parse list of configs. -IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" do diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-server.sh index e9d7d6a8d760a..fb5063db86942 100644 --- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh +++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh @@ -50,31 +50,30 @@ launch_trt_server() { git clone https://github.com/triton-inference-server/tensorrtllm_backend.git git lfs install cd tensorrtllm_backend - git checkout $trt_llm_version - tensorrtllm_backend_dir=$(pwd) + git checkout "$trt_llm_version" git submodule update --init --recursive # build trtllm engine cd /tensorrtllm_backend - cd ./tensorrt_llm/examples/${model_type} + cd "./tensorrt_llm/examples/${model_type}" python3 convert_checkpoint.py \ - --model_dir ${model_path} \ - --dtype ${model_dtype} \ - --tp_size ${model_tp_size} \ - --output_dir ${trt_model_path} + --model_dir "${model_path}" \ + --dtype "${model_dtype}" \ + --tp_size "${model_tp_size}" \ + --output_dir "${trt_model_path}" trtllm-build \ - --checkpoint_dir ${trt_model_path} \ + --checkpoint_dir "${trt_model_path}" \ --use_fused_mlp \ --reduce_fusion disable \ --workers 8 \ - --gpt_attention_plugin ${model_dtype} \ - --gemm_plugin ${model_dtype} \ - --tp_size ${model_tp_size} \ - --max_batch_size ${max_batch_size} \ - --max_input_len ${max_input_len} \ - --max_seq_len ${max_seq_len} \ - --max_num_tokens ${max_num_tokens} \ - --output_dir ${trt_engine_path} + --gpt_attention_plugin "${model_dtype}" \ + --gemm_plugin "${model_dtype}" \ + --tp_size "${model_tp_size}" \ + --max_batch_size "${max_batch_size}" \ + --max_input_len "${max_input_len}" \ + --max_seq_len "${max_seq_len}" \ + --max_num_tokens "${max_num_tokens}" \ + --output_dir "${trt_engine_path}" # handle triton protobuf files and launch triton server cd /tensorrtllm_backend @@ -82,15 +81,15 @@ launch_trt_server() { cp -r all_models/inflight_batcher_llm/* triton_model_repo/ cd triton_model_repo rm -rf ./tensorrt_llm/1/* - cp -r ${trt_engine_path}/* ./tensorrt_llm/1 + cp -r "${trt_engine_path}"/* ./tensorrt_llm/1 python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false - python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5 - python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false - python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size - python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1 + python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5" + python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false" + python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size" + python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1" cd /tensorrtllm_backend python3 scripts/launch_triton_server.py \ - --world_size=${model_tp_size} \ + --world_size="${model_tp_size}" \ --model_repo=/tensorrtllm_backend/triton_model_repo & } @@ -98,10 +97,7 @@ launch_trt_server() { launch_tgi_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then @@ -129,10 +125,7 @@ launch_tgi_server() { launch_lmdeploy_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") server_command="lmdeploy serve api_server $model \ @@ -149,10 +142,7 @@ launch_sglang_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then @@ -185,10 +175,7 @@ launch_vllm_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then @@ -217,19 +204,19 @@ launch_vllm_server() { main() { - if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then launch_trt_server fi - if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then launch_tgi_server fi - if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then launch_lmdeploy_server fi - if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then launch_sglang_server fi diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index c6a1bbdeb7d48..686f70dbece6c 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -16,10 +16,10 @@ main() { fi # initial annotation - description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" + #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" # download results - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" mkdir -p results/ /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ ls @@ -30,15 +30,15 @@ main() { /workspace/buildkite-agent artifact upload "results.zip" # upload benchmarking scripts - cd $VLLM_SOURCE_CODE_LOC/ + cd "$VLLM_SOURCE_CODE_LOC/" zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" - cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" # upload benchmarking pipeline /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" - cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md @@ -75,4 +75,4 @@ main() { # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md } -main "$@" \ No newline at end of file +main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index dd8c15e0700eb..3f38cf5137535 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -12,7 +12,7 @@ check_gpus() { echo "Need at least 1 GPU to run benchmarking." exit 1 fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')" echo "GPU type is $gpu_type" } @@ -102,7 +102,7 @@ kill_gpu_processes() { pkill -f text-generation pkill -f lmdeploy - while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1 done } @@ -119,8 +119,8 @@ wait_for_server() { ensure_installed() { # Ensure that the given command is installed by apt-get local cmd=$1 - if ! which $cmd >/dev/null; then - apt-get update && apt-get install -y $cmd + if ! which "$cmd" >/dev/null; then + apt-get update && apt-get install -y "$cmd" fi } @@ -173,13 +173,11 @@ run_serving_tests() { echo "Reuse previous server for test case $test_name" else kill_gpu_processes - bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \ + bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ "$server_params" "$common_params" fi - wait_for_server - - if [ $? -eq 0 ]; then + if wait_for_server; then echo "" echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." else @@ -190,13 +188,13 @@ run_serving_tests() { # prepare tokenizer # this is required for lmdeploy. - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" rm -rf /tokenizer_cache mkdir /tokenizer_cache python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ --model "$model" \ --cachedir /tokenizer_cache - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" # change model name for lmdeploy (it will not follow standard hf name) @@ -307,11 +305,11 @@ run_serving_tests() { prepare_dataset() { # download sharegpt dataset - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # duplicate sonnet by 4x, to allow benchmarking with input length 2048 - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" echo "" > sonnet_4x.txt for _ in {1..4} do @@ -339,17 +337,17 @@ main() { prepare_dataset - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" declare -g RESULTS_FOLDER=results/ mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" # run the test - run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" # upload benchmark results to buildkite python3 -m pip install tabulate pandas - python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py + python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" upload_to_buildkite } diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index a0b9a409b758d..d397b05cdff23 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -17,7 +17,7 @@ check_gpus() { echo "Need at least 1 GPU to run benchmarking." exit 1 fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') echo "GPU type is $gpu_type" } @@ -93,7 +93,7 @@ kill_gpu_processes() { # wait until GPU memory usage smaller than 1GB - while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1 done @@ -117,7 +117,7 @@ upload_to_buildkite() { fi # Use the determined command to annotate and upload artifacts - $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md + $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md" $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" } @@ -150,7 +150,7 @@ run_latency_tests() { # check if there is enough GPU to run the test tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi @@ -206,9 +206,9 @@ run_throughput_tests() { throughput_args=$(json2args "$throughput_params") # check if there is enough GPU to run the test - tp=$(echo $throughput_params | jq -r '.tensor_parallel_size') + tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi @@ -270,7 +270,7 @@ run_serving_tests() { # check if there is enough GPU to run the test tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi @@ -278,7 +278,7 @@ run_serving_tests() { server_model=$(echo "$server_params" | jq -r '.model') client_model=$(echo "$client_params" | jq -r '.model') if [[ $server_model != "$client_model" ]]; then - echo "Server model and client model must be the same. Skip testcase $testname." + echo "Server model and client model must be the same. Skip testcase $test_name." continue fi @@ -293,8 +293,7 @@ run_serving_tests() { server_pid=$! # wait until the server is alive - wait_for_server - if [ $? -eq 0 ]; then + if wait_for_server; then echo "" echo "vllm server is up and running." else diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh index f16862907def1..19f7160e68a4d 100644 --- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh +++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh @@ -6,7 +6,7 @@ TIMEOUT_SECONDS=10 retries=0 while [ $retries -lt 1000 ]; do - if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then + if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then exit 0 fi @@ -16,4 +16,4 @@ while [ $retries -lt 1000 ]; do sleep 5 done -exit 1 \ No newline at end of file +exit 1 diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 860272e71fd84..902e162720b89 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script runs test inside the corresponding ROCm docker container. set -o pipefail @@ -57,17 +59,17 @@ done echo "--- Pulling container" image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" -docker pull ${image_name} +docker pull "${image_name}" remove_docker_container() { - docker rm -f ${container_name} || docker image rm -f ${image_name} || true + docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true } trap remove_docker_container EXIT echo "--- Running container" HF_CACHE="$(realpath ~)/huggingface" -mkdir -p ${HF_CACHE} +mkdir -p "${HF_CACHE}" HF_MOUNT="/root/.cache/huggingface" commands=$@ @@ -118,25 +120,25 @@ if [[ $commands == *"--shard-id="* ]]; then --network host \ --shm-size=16gb \ --rm \ - -e HIP_VISIBLE_DEVICES=${GPU} \ + -e HIP_VISIBLE_DEVICES="${GPU}" \ -e HF_TOKEN \ - -v ${HF_CACHE}:${HF_MOUNT} \ - -e HF_HOME=${HF_MOUNT} \ - --name ${container_name}_${GPU} \ - ${image_name} \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + --name "${container_name}_${GPU}" \ + "${image_name}" \ /bin/bash -c "${commands_gpu}" \ |& while read -r line; do echo ">>Shard $GPU: $line"; done & PIDS+=($!) done #wait for all processes to finish and collect exit codes - for pid in ${PIDS[@]}; do - wait ${pid} + for pid in "${PIDS[@]}"; do + wait "${pid}" STATUS+=($?) done - for st in ${STATUS[@]}; do + for st in "${STATUS[@]}"; do if [[ ${st} -ne 0 ]]; then echo "One of the processes failed with $st" - exit ${st} + exit "${st}" fi done else @@ -147,9 +149,9 @@ else --rm \ -e HIP_VISIBLE_DEVICES=0 \ -e HF_TOKEN \ - -v ${HF_CACHE}:${HF_MOUNT} \ - -e HF_HOME=${HF_MOUNT} \ - --name ${container_name} \ - ${image_name} \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + --name "${container_name}" \ + "${image_name}" \ /bin/bash -c "${commands}" fi diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh index cbf6dda677c53..1641c1faa9d6a 100644 --- a/.buildkite/run-benchmarks.sh +++ b/.buildkite/run-benchmarks.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script is run by buildkite to run the benchmarks and upload the results to buildkite set -ex diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 06c48f8ed5613..9fbacaab0ec47 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the CPU docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. set -ex @@ -13,7 +15,7 @@ docker build -t cpu-test -f Dockerfile.ppc64le . # Run the image, setting --shm-size=4g for tensor parallel. source /etc/environment #docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test # Run basic model test docker exec cpu-test bash -c " diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 2dbeee8562971..064d7c77ab570 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the CPU docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. set -ex diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh index 7ac4dcc4c786d..530bf90a855fe 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/run-multi-node-test.sh @@ -14,7 +14,7 @@ DOCKER_IMAGE=$4 shift 4 COMMANDS=("$@") -if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then +if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then echo "The number of commands must be equal to the number of nodes." echo "Number of nodes: $NUM_NODES" echo "Number of commands: ${#COMMANDS[@]}" @@ -23,7 +23,7 @@ fi echo "List of commands" for command in "${COMMANDS[@]}"; do - echo $command + echo "$command" done start_network() { @@ -36,7 +36,7 @@ start_nodes() { for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) GPU_DEVICES+=$(($DEVICE_NUM)) - if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then + if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then GPU_DEVICES+=',' fi done @@ -49,17 +49,20 @@ start_nodes() { # 3. map the huggingface cache directory to the container # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: # starting from 192.168.10.11) - docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null" + docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \ + -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \ + --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \ + /bin/bash -c "tail -f /dev/null" # organize containers into a ray cluster - if [ $node -eq 0 ]; then + if [ "$node" -eq 0 ]; then # start the ray head node - docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block" + docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block" # wait for the head node to be ready sleep 10 else # start the ray worker nodes, and connect them to the head node - docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block" + docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block" fi done @@ -79,22 +82,22 @@ run_nodes() { for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) GPU_DEVICES+=$(($DEVICE_NUM)) - if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then + if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then GPU_DEVICES+=',' fi done GPU_DEVICES+='"' echo "Running node$node with GPU devices: $GPU_DEVICES" - if [ $node -ne 0 ]; then - docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + if [ "$node" -ne 0 ]; then + docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" else - docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" fi done } cleanup() { for node in $(seq 0 $(($NUM_NODES-1))); do - docker stop node$node + docker stop "node$node" done docker network rm docker-net } diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 252c0f7fecd12..9259391aaed49 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the Neuron docker image and run the API server inside the container. # It serves a sanity check for compilation and basic model usage. set -e @@ -12,10 +14,10 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then current_time=$(date +%s) if [ $((current_time - last_build)) -gt 86400 ]; then docker system prune -f - echo $current_time > /tmp/neuron-docker-build-timestamp + echo "$current_time" > /tmp/neuron-docker-build-timestamp fi else - echo $(date +%s) > /tmp/neuron-docker-build-timestamp + date "+%s" > /tmp/neuron-docker-build-timestamp fi docker build -t neuron -f Dockerfile.neuron . @@ -34,7 +36,7 @@ wait_for_server_to_start() { timeout=300 counter=0 - while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do + while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do sleep 1 counter=$((counter + 1)) if [ $counter -ge $timeout ]; then diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 35ad5c0ddde77..6b12f424fd828 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the OpenVINO docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. set -ex diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index 988d5aef5fb8c..770dad6ffa3a1 100644 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e # Build the docker image. @@ -12,4 +14,4 @@ remove_docker_container # For HF_TOKEN. source /etc/environment # Run a simple end-to-end example. -docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" +docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index 6ffa66d5ef3d6..faeac8e2ded36 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the CPU docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. set -ex diff --git a/.github/workflows/scripts/cuda-install.sh b/.github/workflows/scripts/cuda-install.sh index 312c6e82f33a3..3d0b7a1fe0402 100644 --- a/.github/workflows/scripts/cuda-install.sh +++ b/.github/workflows/scripts/cuda-install.sh @@ -1,16 +1,16 @@ #!/bin/bash # Replace '.' with '-' ex: 11.8 -> 11-8 -cuda_version=$(echo $1 | tr "." "-") +cuda_version=$(echo "$1" | tr "." "-") # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004 -OS=$(echo $2 | tr -d ".\-") +OS=$(echo "$2" | tr -d ".\-") # Installs CUDA -wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb +wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb" sudo dpkg -i cuda-keyring_1.1-1_all.deb rm cuda-keyring_1.1-1_all.deb sudo apt -qq update -sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} +sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}" sudo apt clean # Test nvcc diff --git a/.github/workflows/scripts/pytorch-install.sh b/.github/workflows/scripts/pytorch-install.sh index dfc1851d7692c..e3cda7dad2d17 100644 --- a/.github/workflows/scripts/pytorch-install.sh +++ b/.github/workflows/scripts/pytorch-install.sh @@ -6,7 +6,7 @@ cuda_version=$3 # Install torch $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya -$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./} +$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}" # Print version information $python_executable --version diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml new file mode 100644 index 0000000000000..ac43b20c31390 --- /dev/null +++ b/.github/workflows/shellcheck.yml @@ -0,0 +1,37 @@ +name: Lint shell scripts +on: + push: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + pull_request: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + shellcheck: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + fetch-depth: 0 + + - name: "Check shell scripts" + run: | + tools/shellcheck.sh diff --git a/.gitignore b/.gitignore index 1ea6e3419db2a..ceef6a5fba456 100644 --- a/.gitignore +++ b/.gitignore @@ -202,3 +202,4 @@ benchmarks/*.json # Linting actionlint +shellcheck*/ diff --git a/.shellcheckrc b/.shellcheckrc new file mode 100644 index 0000000000000..f3b6eedf8d907 --- /dev/null +++ b/.shellcheckrc @@ -0,0 +1,9 @@ +# rules currently disabled: +# +# SC1091 (info): Not following: was not specified as input (see shellcheck -x) +# SC2004 (style): $/${} is unnecessary on arithmetic variables. +# SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects. +# SC2155 (warning): Declare and assign separately to avoid masking return values. +# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails. +# +disable=SC1091,SC2004,SC2129,SC2155,SC2164 diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh index 8c5cd454fbbee..ba7383d88dc49 100755 --- a/benchmarks/launch_tgi_server.sh +++ b/benchmarks/launch_tgi_server.sh @@ -4,13 +4,13 @@ PORT=8000 MODEL=$1 TOKENS=$2 -docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \ - -v $PWD/data:/data \ +docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \ + -v "$PWD/data:/data" \ ghcr.io/huggingface/text-generation-inference:2.2.0 \ - --model-id $MODEL \ + --model-id "$MODEL" \ --sharded false \ --max-input-length 1024 \ --max-total-tokens 2048 \ --max-best-of 5 \ --max-concurrent-requests 5000 \ - --max-batch-total-tokens $TOKENS + --max-batch-total-tokens "$TOKENS" diff --git a/examples/run_cluster.sh b/examples/run_cluster.sh index 8e4aa59e1766d..7b4b40b4b7e23 100644 --- a/examples/run_cluster.sh +++ b/examples/run_cluster.sh @@ -14,7 +14,7 @@ PATH_TO_HF_HOME="$4" shift 4 # Additional arguments are passed directly to the Docker command -ADDITIONAL_ARGS="$@" +ADDITIONAL_ARGS=("$@") # Validate node type if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then @@ -45,5 +45,5 @@ docker run \ --shm-size 10.24g \ --gpus all \ -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \ - ${ADDITIONAL_ARGS} \ + "${ADDITIONAL_ARGS[@]}" \ "${DOCKER_IMAGE}" -c "${RAY_START_CMD}" diff --git a/format.sh b/format.sh index be6ee0ce46dcb..d06ee62351a21 100755 --- a/format.sh +++ b/format.sh @@ -44,14 +44,14 @@ CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') # # params: tool name, tool version, required version tool_version_check() { - if [[ $2 != $3 ]]; then + if [[ "$2" != "$3" ]]; then echo "❓❓Wrong $1 version installed: $3 is required, not $2." exit 1 fi } -tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "yapf" "$YAPF_VERSION" "$(grep yapf requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "ruff" "$RUFF_VERSION" "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)" tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-lint.txt | cut -d'=' -f3)" tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-lint.txt | cut -d'=' -f3)" tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-lint.txt | cut -d'=' -f3)" @@ -294,6 +294,10 @@ echo 'vLLM actionlint:' tools/actionlint.sh -color echo 'vLLM actionlint: Done' +echo 'vLLM shellcheck:' +tools/shellcheck.sh +echo 'vLLM shellcheck: Done' + if ! git diff --quiet &>/dev/null; then echo echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" diff --git a/tests/weight_loading/run_model_weight_loading_test.sh b/tests/weight_loading/run_model_weight_loading_test.sh index e80c1d6c5849c..a4d0c44c22b51 100755 --- a/tests/weight_loading/run_model_weight_loading_test.sh +++ b/tests/weight_loading/run_model_weight_loading_test.sh @@ -14,7 +14,7 @@ while getopts "c:" OPT; do done -IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" do diff --git a/tools/mypy.sh b/tools/mypy.sh index 7e8f7d402cdd5..e984e739d70cf 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -3,13 +3,13 @@ CI=${1:-0} PYTHON_VERSION=${2:-3.9} -if [ $CI -eq 1 ]; then +if [ "$CI" -eq 1 ]; then set -e fi run_mypy() { echo "Running mypy on $1" - if [ $CI -eq 1 ] && [ -z "$1" ]; then + if [ "$CI" -eq 1 ] && [ -z "$1" ]; then mypy --python-version "${PYTHON_VERSION}" "$@" return fi diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh new file mode 100755 index 0000000000000..e850742a07900 --- /dev/null +++ b/tools/shellcheck.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +scversion="stable" + +if [ -d "shellcheck-${scversion}" ]; then + export PATH="$PATH:$(pwd)/shellcheck-${scversion}" +fi + +if ! [ -x "$(command -v shellcheck)" ]; then + if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then + echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing" + exit 1 + fi + + # automatic local install if linux x86_64 + wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv + export PATH="$PATH:$(pwd)/shellcheck-${scversion}" +fi + +# TODO - fix warnings in .buildkite/run-amd-test.sh +find . -name "*.sh" -not -path "./.deps/*" -not -path "./.buildkite/run-amd-test.sh" -exec shellcheck {} +