Skip to content

Commit 68aaaaf

Browse files
committed
Merge remote-tracking branch 'origin/main' into port_nixl
2 parents 9322571 + e3dd6a6 commit 68aaaaf

25 files changed

+1713
-247
lines changed

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,5 @@ numpy==1.26.4
55
tabulate
66
setuptools>=77.0.3,<80.0.0
77
setuptools-scm>=8
8+
numba
9+
transformers>=4.1,<4.56.0

tests/full_tests/ci_gsm8k_tests.sh

Lines changed: 43 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ echo "Test with deepseek_v2 + inc passed"
5252

5353
# deepseek v2 + inc + dynamic quantization + tp2
5454
echo "Testing deepseek_v2 + inc dynamic quantization + tp2"
55-
echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
55+
echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --tensor-parallel-size 2
5656
QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json \
5757
HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --tensor-parallel-size 2
5858
if [ $? -ne 0 ]; then
@@ -61,6 +61,26 @@ if [ $? -ne 0 ]; then
6161
fi
6262
echo "Test with deepseek_v2 + inc dynamic quantization + tp 2 successful"
6363

64+
# QWEN3 + blockfp8 + dynamic scaling
65+
echo "Testing Qwen3-8B-FP8 + blockfp8 + dynamic scaling"
66+
echo HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
67+
HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
68+
if [ $? -ne 0 ]; then
69+
echo "Error: Test failed for Qwen3-8B-FP8 + blockfp8 + dynamic scaling" >&2
70+
exit -1
71+
fi
72+
echo "Test with Qwen3-8B-FP8 + blockfp8 + dynamic scaling successful"
73+
74+
# QWEN3 compressed tensor + dynamic scaling
75+
echo "Testing Qwen3-8B-FP8-dynamic + compressed-tensor + dynamic scaling"
76+
echo HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model RedHatAI/Qwen3-8B-FP8-dynamic --trust-remote-code
77+
HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model RedHatAI/Qwen3-8B-FP8-dynamic --trust-remote-code
78+
if [ $? -ne 0 ]; then
79+
echo "Error: Test failed for Qwen3-8B-FP8-dynamic + compressed-tensor + dynamic scaling" >&2
80+
exit -1
81+
fi
82+
echo "Test with Qwen3-8B-FP8-dynamic + compressed-tensor + dynamic scaling successful"
83+
6484
# structured output
6585
echo "Testing structured output"
6686
echo HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/structured_outputs.py
@@ -122,19 +142,16 @@ fi
122142
echo "Test with deepseek R1 passed"
123143

124144
# used to check HPUATTN + MOE + ExpertParallel
125-
#NOTE(adobrzyn): CI broked, to be brought back after fix
126-
echo "Skipping GSM8K on QWEN3-30B-A3B"
127-
128-
# echo "Testing GSM8K on QWEN3-30B-A3B"
129-
# echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
130-
# pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
131-
# VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
132-
# pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
133-
# if [ $? -ne 0 ]; then
134-
# echo "Error: Test failed for QWEN3-30B-A3B" >&2
135-
# exit -1
136-
# fi
137-
# echo "Test with QWEN3-30B-A3B passed"
145+
echo "Testing GSM8K on QWEN3-30B-A3B"
146+
echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
147+
pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
148+
VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
149+
pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
150+
if [ $? -ne 0 ]; then
151+
echo "Error: Test failed for QWEN3-30B-A3B" >&2
152+
exit -1
153+
fi
154+
echo "Test with QWEN3-30B-A3B passed"
138155

139156
# multimodal-support with qwen2.5-vl
140157
echo "Testing Qwen2.5-VL-7B"
@@ -146,4 +163,15 @@ if [ $? -ne 0 ]; then
146163
echo "Error: Test failed for multimodal-support with qwen2.5-vl-7b" >&2
147164
exit -1
148165
fi
149-
echo "Test with multimodal-support with qwen2.5-vl-7b passed"
166+
echo "Test with multimodal-support with qwen2.5-vl-7b passed"
167+
168+
# spec decode with ngram
169+
# For G3, acc rate is 0.18, but for G2, it is 0.09
170+
echo "Testing Spec-decode with ngram"
171+
echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 python vllm-gaudi/tests/full_tests/spec_decode.py --task ngram --assert_acc_rate 0.09 --osl 1024
172+
VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 python vllm-gaudi/tests/full_tests/spec_decode.py --task ngram --assert_acc_rate 0.09 --osl 1024
173+
if [ $? -ne 0 ]; then
174+
echo "Error: Test failed for spec decode with ngram" >&2
175+
exit -1
176+
fi
177+
echo "Test with spec decode with ngram passed"

0 commit comments

Comments (0)