# deepseek-r1-671B-A100.yaml
# Adjusted from deepseek-r1-671B.yaml for A100.
name: deepseek-r1-A100

resources:
  accelerators: { A100-80GB:8 }
  disk_size: 2048 # Holds both the FP8 download (roughly 700 GB) and the converted BF16 copy (about 1.3 TB)
  disk_tier: best
  ports: 30000
  any_of:
    - use_spot: true
    - use_spot: false

num_nodes: 4 # Number of nodes to launch; the requirement differs across accelerators
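
# A typical launch, assuming the SkyPilot CLI is installed and a cloud with
# A100-80GB quota is enabled ("deepseek-r1" is an arbitrary cluster name):
#   sky launch -c deepseek-r1 deepseek-r1-671B-A100.yaml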

setup: |
  # Install sglang with all dependencies using uv
  uv pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer
  # Raise the kernel's per-process memory-map limit; loading a model this
  # large needs far more mmap regions than the default 65530 allows
  sudo bash -c "echo 'vm.max_map_count=655300' >> /etc/sysctl.conf"
  sudo sysctl -p
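  # Optional check: `sysctl vm.max_map_count` should now report 655300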
echo "FP8 is not supported on A100, we need to convert the model to BF16 format"
# Conversion script
git clone https://github.com/deepseek-ai/DeepSeek-V3.git deepseek_repo
# A workaround for running conversion script on A100. See https://github.com/deepseek-ai/DeepSeek-V3/issues/4
CONVERSION_SCRIPT="deepseek_repo/inference/fp8_cast_bf16.py"
sed -i 's/new_state_dict\[weight_name\] = weight_dequant(weight, scale_inv)/new_state_dict[weight_name] = weight_dequant(weight.float(), scale_inv)/' $CONVERSION_SCRIPT
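  # The patched line upcasts the FP8 tensor before dequantizing:
  #   new_state_dict[weight_name] = weight_dequant(weight.float(), scale_inv)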
  uv venv venv_convert && source venv_convert/bin/activate
  # setuptools is needed by triton
  uv pip install huggingface_hub setuptools -r deepseek_repo/inference/requirements.txt
  # Download the model weights and convert to BF16 format
  echo "Downloading model weights..."
  FP8_MODEL_DIR="DeepSeek-R1-FP8"
  python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='deepseek-ai/DeepSeek-R1', local_dir='./$FP8_MODEL_DIR')"
  # Convert the model to BF16 format
  MODEL_DIR="DeepSeek-R1-BF16"
  python $CONVERSION_SCRIPT \
    --input-fp8-hf-path $FP8_MODEL_DIR \
    --output-bf16-hf-path $MODEL_DIR
  if [ $? -ne 0 ]; then
    echo "BF16 conversion failed"
    exit 1
  fi
  MODEL_FILES=(
    "config.json"
    "generation_config.json"
    "modeling_deepseek.py"
    "configuration_deepseek.py"
    "tokenizer.json"
    "tokenizer_config.json"
    # the BF16 directory has its own model.safetensors.index.json
  )
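  # The expansion "${MODEL_FILES[@]/#/$FP8_MODEL_DIR/}" below prefixes each
  # array entry with "$FP8_MODEL_DIR/" before copying into the BF16 directory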
cp "${MODEL_FILES[@]/#/$FP8_MODEL_DIR/}" $MODEL_DIR/
# See https://github.com/sgl-project/sglang/issues/3491
sed -i '/"quantization_config": {/,/}/d' $MODEL_DIR/config.json
echo "BF16 conversion completed. Model saved to $(realpath $MODEL_DIR)"
ls -lh "$MODEL_DIR" # List files for verification

run: |
  # Launch the server with the appropriate distributed configuration
  MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
  # TP degree = GPUs per node x number of nodes
  TP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES))
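  # With the resources above (4 nodes x 8 A100-80GB), TP resolves to 32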
  # For A100, only the head node exposes an HTTP endpoint for serving requests
  if [ "$SKYPILOT_NODE_RANK" -eq 0 ]; then
    HEAD_NODE_ARGS="--host 0.0.0.0 --port 30000"
  else
    HEAD_NODE_ARGS=""
  fi
  python -m sglang.launch_server \
    --model-path DeepSeek-R1-BF16 \
    --tp $TP \
    --dist-init-addr ${MASTER_ADDR}:5000 \
    --nnodes ${SKYPILOT_NUM_NODES} \
    --node-rank ${SKYPILOT_NODE_RANK} \
    --trust-remote-code \
    --enable-dp-attention \
    --enable-torch-compile \
    --torch-compile-max-bs 8 \
    $HEAD_NODE_ARGS
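  # Once the head node logs that the server is ready, it exposes an
  # OpenAI-compatible API on port 30000 (the endpoint shape follows sglang's
  # OpenAI-compatible server; HEAD_IP below is a placeholder for the head
  # node's IP, e.g. from `sky status --ip <cluster>`):
  #   curl http://HEAD_IP:30000/v1/chat/completions \
  #     -H "Content-Type: application/json" \
  #     -d '{"model": "DeepSeek-R1-BF16", "messages": [{"role": "user", "content": "Hello"}]}'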