llm/deepseek-r1/deepseek-r1-671B-A100.yaml

# Ajusted on deepseek-r1-671B.yaml for A100.
name: deepseek-r1-A100

resources:
  accelerators: { A100-80GB:8 }
  disk_size: 2048 # The model in BF16 format takes about 1.3TB
  disk_tier: best
  ports: 30000
  any_of:
    - use_spot: true
    - use_spot: false

num_nodes: 4 # Specify number of nodes to launch, the requirement might be different for different accelerators

setup: |
  # Install sglang with all dependencies using uv
  uv pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer

  # Set up shared memory for better performance
  sudo bash -c "echo 'vm.max_map_count=655300' >> /etc/sysctl.conf"
  sudo sysctl -p

  echo "FP8 is not supported on A100, we need to convert the model to BF16 format"

  # Conversion script
  git clone https://github.com/deepseek-ai/DeepSeek-V3.git deepseek_repo
  # A workaround for running conversion script on A100. See https://github.com/deepseek-ai/DeepSeek-V3/issues/4
  CONVERSION_SCRIPT="deepseek_repo/inference/fp8_cast_bf16.py"
  sed -i 's/new_state_dict\[weight_name\] = weight_dequant(weight, scale_inv)/new_state_dict[weight_name] = weight_dequant(weight.float(), scale_inv)/' $CONVERSION_SCRIPT

  uv venv venv_convert && source venv_convert/bin/activate

  # setuptools is needed by triton
  uv pip install huggingface_hub setuptools -r deepseek_repo/inference/requirements.txt

  # Download the model weights and convert to BF16 format
  echo "Downloading model weights..."
  FP8_MODEL_DIR="DeepSeek-R1-FP8"
  python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='deepseek-ai/DeepSeek-R1', local_dir='./$FP8_MODEL_DIR')"

  # Convert the model to BF16 format
  MODEL_DIR="DeepSeek-R1-BF16"
  python $CONVERSION_SCRIPT \
    --input-fp8-hf-path $FP8_MODEL_DIR \
    --output-bf16-hf-path $MODEL_DIR

  if [ $? -ne 0 ]; then
    echo "BF16 conversion failed"
    exit 1
  fi

  MODEL_FILES=(
  "config.json"
  "generation_config.json"
  "modeling_deepseek.py"
  "configuration_deepseek.py"
  "tokenizer.json"
  "tokenizer_config.json"
  # the bf16 directory has its own model.safetensors.index.json
  )
  cp "${MODEL_FILES[@]/#/$FP8_MODEL_DIR/}" $MODEL_DIR/
  # See https://github.com/sgl-project/sglang/issues/3491
  sed -i '/"quantization_config": {/,/}/d' $MODEL_DIR/config.json

  echo "BF16 conversion completed. Model saved to $(realpath $MODEL_DIR)"
  ls -lh "$MODEL_DIR"  # List files for verification

run: |
  # Launch the server with appropriate configuration
  MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
  # TP should be number of GPUs per node times number of nodes
  TP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES))

  # For A100, we only export the head node for serving requests
  if [ "$SKYPILOT_NODE_RANK" -eq 0 ]; then
      HEAD_NODE_ARGS="--host 0.0.0.0 --port 30000"
  else
      HEAD_NODE_ARGS=""
  fi

  python -m sglang.launch_server \
    --model-path DeepSeek-R1-BF16 \
    --tp $TP \
    --dist-init-addr ${MASTER_ADDR}:5000 \
    --nnodes ${SKYPILOT_NUM_NODES} \
    --node-rank ${SKYPILOT_NODE_RANK} \
    --trust-remote-code \
    --enable-dp-attention \
    --enable-torch-compile \
    --torch-compile-max-bs 8 \
    $HEAD_NODE_ARGS