@@ -52,7 +52,7 @@ echo "Test with deepseek_v2 + inc passed"

# deepseek v2 + inc + dynamic quantization + tp2
echo "Testing deepseek_v2 + inc dynamic quantization + tp2"
- echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
+ echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --tensor-parallel-size 2
QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json \
HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --tensor-parallel-size 2
if [ $? -ne 0 ]; then
@@ -61,6 +61,26 @@ if [ $? -ne 0 ]; then
fi
echo " Test with deepseek_v2 + inc dynamic quantization + tp 2 successful"
63
63
64
+ # QWEN3 + blockfp8 + dynamic scaling
65
+ echo " Testing Qwen3-8B-FP8 + blockfp8 + dynamic scaling"
66
+ echo HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
67
+ HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
68
+ if [ $? -ne 0 ]; then
69
+ echo " Error: Test failed for Qwen3-8B-FP8 + blockfp8 + dynamic scaling" >&2
70
+ exit -1
71
+ fi
72
+ echo " Test with Qwen3-8B-FP8 + blockfp8 + dynamic scaling successful"
73
+
74
+ # QWEN3 compressed tensor + dynamic scaling
75
+ echo " Testing Qwen3-8B-FP8-dynamic + compressed-tensor + dynamic scaling"
76
+ echo HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model RedHatAI/Qwen3-8B-FP8-dynamic --trust-remote-code
77
+ HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model RedHatAI/Qwen3-8B-FP8-dynamic --trust-remote-code
78
+ if [ $? -ne 0 ]; then
79
+ echo " Error: Test failed for Qwen3-8B-FP8-dynamic + compressed-tensor + dynamic scaling" >&2
80
+ exit -1
81
+ fi
82
+ echo " Test with Qwen3-8B-FP8-dynamic + compressed-tensor + dynamic scaling successful"
83
+
64
84
# structured output
echo "Testing structured output"
echo HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/structured_outputs.py
@@ -122,19 +142,16 @@
echo " Test with deepseek R1 passed"
# used to check HPUATTN + MOE + ExpertParallel
- # NOTE(adobrzyn): CI broked, to be brought back after fix
- echo "Skipping GSM8K on QWEN3-30B-A3B"
-
- # echo "Testing GSM8K on QWEN3-30B-A3B"
- # echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
- # pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
- # VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
- # pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
- # if [ $? -ne 0 ]; then
- # echo "Error: Test failed for QWEN3-30B-A3B" >&2
- # exit -1
- # fi
- # echo "Test with QWEN3-30B-A3B passed"
+ echo "Testing GSM8K on QWEN3-30B-A3B"
+ echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
+ pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
+ VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
+ pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
+ if [ $? -ne 0 ]; then
+ echo "Error: Test failed for QWEN3-30B-A3B" >&2
+ exit -1
+ fi
+ echo "Test with QWEN3-30B-A3B passed"

# multimodal-support with qwen2.5-vl
echo "Testing Qwen2.5-VL-7B"
@@ -146,4 +163,15 @@ if [ $? -ne 0 ]; then
echo "Error: Test failed for multimodal-support with qwen2.5-vl-7b" >&2
exit -1
fi
- echo "Test with multimodal-support with qwen2.5-vl-7b passed"
+ echo " Test with multimodal-support with qwen2.5-vl-7b passed"
167
+
+ # spec decode with ngram
+ # For G3, acc rate is 0.18, but for G2, it is 0.09
+ echo "Testing Spec-decode with ngram"
+ echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 python vllm-gaudi/tests/full_tests/spec_decode.py --task ngram --assert_acc_rate 0.09 --osl 1024
+ VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 python vllm-gaudi/tests/full_tests/spec_decode.py --task ngram --assert_acc_rate 0.09 --osl 1024
+ if [ $? -ne 0 ]; then
174
+ echo " Error: Test failed for spec decode with ngram" >&2
175
+ exit -1
176
+ fi
+ echo " Test with spec decode with ngram passed"
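
Aside: every test block this diff adds repeats the same "echo the command, run it, check $?, report" pattern. Below is a minimal sketch of how that pattern could be factored into a shared helper; the run_ci_test name and the env(1)-based invocation are illustrative assumptions, not part of this change.

# Sketch only (hypothetical helper, not in this PR): wraps the repeated
# "echo command -> run command -> check exit status -> report" pattern.
run_ci_test() {
    local name="$1"; shift
    echo "Testing ${name}"
    # Print the exact command before running it, as the script does today.
    echo "$@"
    "$@"
    if [ $? -ne 0 ]; then
        echo "Error: Test failed for ${name}" >&2
        exit 1
    fi
    echo "Test with ${name} passed"
}

# Example usage, mirroring the Qwen3-8B-FP8 block above; environment
# variables are passed through env(1) so they apply only to this command:
# run_ci_test "Qwen3-8B-FP8 + blockfp8 + dynamic scaling" \
#     env HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
#     python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code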