diff --git a/.jenkins/test_config.yaml b/.jenkins/test_config.yaml
index e4cf05c0..d8443ccd 100644
--- a/.jenkins/test_config.yaml
+++ b/.jenkins/test_config.yaml
@@ -81,13 +81,12 @@ stages:
         cd .jenkins/lm-eval-harness &&
         PT_HPU_LAZY_MODE=1
         bash run-tests.sh -c configs/models-fp8.txt -t 2
-    # Chendi: llama4 upstream modeling changed, need to fix
-    # - name: gsm8k_fp8_llama4_scout_g3_tp2_compressed_tensor
-    #   flavor: g3.s
-    #   command: >-
-    #     cd .jenkins/lm-eval-harness &&
-    #     VLLM_CONTIGUOUS_PA=False PT_HPU_LAZY_MODE=1
-    #     bash run-tests.sh -c configs/models-fp8-compressedtensor.txt -t 2
+    - name: gsm8k_fp8_llama4_scout_g3_tp2_compressed_tensor
+      flavor: g3.s
+      command: >-
+        cd .jenkins/lm-eval-harness &&
+        VLLM_WEIGHT_LOAD_FORCE_SYNC=1 VLLM_CONTIGUOUS_PA=False PT_HPU_LAZY_MODE=1
+        bash run-tests.sh -c configs/models-fp8-compressedtensor.txt -t 2
     # Chendi: crash on model weight loading, need to fix
     # - name: gsm8k_fp8_qwen3_30B_g3_tp1_block_scale_dynamic
     #   flavor: g3
@@ -102,10 +101,9 @@ stages:
     #     cd .jenkins/lm-eval-harness &&
     #     VLLM_CONTIGUOUS_PA=False PT_HPU_LAZY_MODE=1 VLLM_HPU_FORCE_CHANNEL_FP8=0
     #     bash run-tests.sh -c configs/models-fp8-blockfp8.txt -t 1
-    # Chendi: comment multimodal test since it is not enabled in V1 yet.
-    # - name: multimodal_llama4_scout_g3_tp2_ep
-    #   flavor: g3.s
-    #   command: >-
-    #     cd .jenkins/vision &&
-    #     PT_HPU_LAZY_MODE=1 VLLM_WEIGHT_LOAD_FORCE_SYNC=1
-    #     bash run-tests.sh -c configs/models-llama4-scout.txt -t 2
\ No newline at end of file
+    - name: multimodal_llama4_scout_g3_tp2_ep
+      flavor: g3.s
+      command: >-
+        cd .jenkins/vision &&
+        PT_HPU_LAZY_MODE=1 VLLM_WEIGHT_LOAD_FORCE_SYNC=1
+        bash run-tests.sh -c configs/models-llama4-scout.txt -t 2
\ No newline at end of file
diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py
index 58b9b08c..072e0e83 100644
--- a/vllm_gaudi/attention/backends/hpu_attn.py
+++ b/vllm_gaudi/attention/backends/hpu_attn.py
@@ -19,7 +19,8 @@
                                               AttentionType)
 from vllm.attention.backends.mla.common import MLACommonImpl
 from vllm.attention.backends.utils import CommonAttentionState
-from vllm_gaudi.attention.ops.hpu_paged_attn import (HPUPagedAttention, HPUPagedAttentionMetadata)
+from vllm_gaudi.attention.ops.hpu_paged_attn import (HPUPagedAttention, HPUPagedAttentionMetadata,
+                                                     HPUPagedAttentionMetadataBuilder)
 from vllm_gaudi.extension.logger import logger as init_logger
 from vllm_gaudi.extension.unified import (unified_attn, HPUUnifiedAttentionMetadata)
 
@@ -45,6 +46,10 @@ def get_metadata_cls() -> type["AttentionMetadata"]:
     def get_state_cls() -> type["CommonAttentionState"]:
         raise NotImplementedError()
 
+    @staticmethod
+    def get_builder_cls() -> type[HPUPagedAttentionMetadataBuilder]:
+        return HPUPagedAttentionMetadataBuilder
+
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
diff --git a/vllm_gaudi/attention/ops/hpu_paged_attn.py b/vllm_gaudi/attention/ops/hpu_paged_attn.py
index 8586bbbe..436e6283 100644
--- a/vllm_gaudi/attention/ops/hpu_paged_attn.py
+++ b/vllm_gaudi/attention/ops/hpu_paged_attn.py
@@ -24,6 +24,28 @@ class HPUPagedAttentionMetadata:
     alibi_blocks: Optional[torch.Tensor]
 
 
+@dataclass
+class HPUPagedAttentionMetadataBuilder:
+
+    def __init__(self, input_builder: "HPUPageAttentionInputBuilderBase") -> None:
+        """Create the builder, remember some configuration and parameters."""
+        self.input_builder = input_builder
+
+    def prepare(self) -> None:
+        """Prepare for one batch."""
+        pass
+
+    def build(self, seq_lens: list[int], query_lens: list[int], cuda_graph_pad_size: int,
+              batch_size: int) -> type[HPUPagedAttentionMetadata]:
+        """Build attention metadata with on-device tensors."""
+        return HPUPagedAttentionMetadata
+
+
+@dataclass
+class HPUPageAttentionInputBuilderBase:
+    pass
+
+
 class HPUPagedAttention:
 
     @staticmethod
diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 58f4d7c6..4484fb16 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -1155,7 +1155,8 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput", req_ids: list
 
             self.encoder_cache[mm_hash] = scatter_mm_placeholders(
                 output,
-                is_embed=pos_info.is_embed,
+                is_embed=pos_info.is_embed.to(
+                    device=output.device) if pos_info.is_embed is not None else pos_info.is_embed,
             )
 
     # modified from: vllm/v1/worker/gpu_model_runner.py
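
Not part of the patch: a minimal sketch of how the pieces added above fit together, exercising the new builder stub directly. The direct instantiation of HPUPageAttentionInputBuilderBase and the sample seq_lens/query_lens/batch_size values are illustrative assumptions; in-tree callers would reach the builder through the backend's new get_builder_cls() staticmethod instead.

# Sketch only -- uses just the classes introduced in hpu_paged_attn.py above.
from vllm_gaudi.attention.ops.hpu_paged_attn import (HPUPagedAttentionMetadata,
                                                     HPUPagedAttentionMetadataBuilder,
                                                     HPUPageAttentionInputBuilderBase)

# The builder keeps a reference to an input builder; the base class is an empty stub.
builder = HPUPagedAttentionMetadataBuilder(input_builder=HPUPageAttentionInputBuilderBase())
builder.prepare()  # per-batch reset; currently a no-op
meta_cls = builder.build(seq_lens=[128], query_lens=[1], cuda_graph_pad_size=0, batch_size=1)
assert meta_cls is HPUPagedAttentionMetadata  # build() currently returns the class, not an instance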