Commit 7e56265

code review changes
Signed-off-by: Harish Subramony <[email protected]>
1 parent 141da43 commit 7e56265

7 files changed: 14 additions & 217 deletions

.github/workflows/pre-merge.yaml

Lines changed: 5 additions & 3 deletions
@@ -47,9 +47,8 @@ jobs:
 RUN git checkout main

 # Pinning versions in requirements might be good practice for CI consistency
-RUN pip install pytest pytest_asyncio nixl==0.4.1
+RUN pip install pytest pytest_asyncio
 RUN pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git
-RUN pip install lm-eval[api]

 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
@@ -114,7 +113,10 @@ jobs:
 -e HF_HOME=/workspace/hf_cache \
 -v /mnt/hf_cache:/workspace/hf_cache \
 hpu-plugin-v1-test-env-pre-merge-${{ github.event.pull_request.head.sha }} \
-/bin/bash "/workspace/vllm-gaudi/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh"
+/bin/bash -c "
+  pip install nixl==0.4.1 lm-eval[api] &&
+  /workspace/vllm-gaudi/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+"

 EXITCODE=$?
 echo "Test script exited with code: $EXITCODE"

requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -6,4 +6,5 @@ tabulate
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
 numba
-transformers>=4.1,<4.56.0
+transformers>=4.1,<4.56.0
+nixl==0.4.1

tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh

Lines changed: 0 additions & 159 deletions
This file was deleted.

vllm_gaudi/distributed/kv_transfer/kv_connector/v1/hpu_base.py

Lines changed: 1 addition & 10 deletions
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING, Any, Callable, Literal, Optional
 import torch
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
-    KVConnectorBase_V1, CopyBlocksOp)
+    KVConnectorBase_V1)
 from vllm_gaudi.extension.logger import logger as init_logger

 logger = init_logger()
@@ -24,14 +24,6 @@ def from_raw_dict(
     return None


-def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp):
-    """
-    Set the xPU-specific ops for copying KV between host and device.
-    Needed when host buffer is used for kv transfer (e.g., in NixlConnector)
-    """
-    return
-
-
 # ==============================
 # Scheduler-side methods
 # ==============================
@@ -44,6 +36,5 @@ def set_kv_transfer_params(self, request: "Request"):
         request.raw_kv_transfer_params)
     request.kv_transfer_params = kv_transfer_params

-KVConnectorBase_V1.set_host_xfer_buffer_ops = set_host_xfer_buffer_ops
 KVConnectorBase_V1.set_kv_transfer_params = set_kv_transfer_params
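The pattern in this file is monkey-patching: module-level functions are assigned onto the imported KVConnectorBase_V1 class so every connector subclass inherits them; this commit drops the set_host_xfer_buffer_ops patch and keeps only set_kv_transfer_params. A minimal, self-contained sketch of the pattern (Base and extra_method are stand-in names, not from the plugin):

    class Base:
        pass

    def extra_method(self, value):
        # behaves exactly as if it had been defined on Base
        self.value = value
        return self.value

    # attach the function to the class; existing and future subclasses pick it up
    Base.extra_method = extra_method

    class Child(Base):
        pass

    assert Child().extra_method(42) == 42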

vllm_gaudi/distributed/kv_transfer/kv_connector/v1/hpu_nixl_connector.py

Lines changed: 1 addition & 37 deletions
@@ -73,43 +73,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
         # will only affects the strides. For MLA instead, we make require no
         # such thing and resort to the standard layout.
         use_mla = len(first_kv_cache.shape) == 3 if self.device_type != "hpu" else False
-        if self.device_type == "tpu":
-            assert not use_mla, f"{self.kv_buffer_device} does not support MLA."
-            assert self._use_pallas_v1, f"attn backend: {self.backend_name}"
-            # tpu (v1) kv shape per layer:
-            # (num_blocks, block_size, num_kv_heads * 2, head_size)
-            self.num_blocks = first_kv_cache.shape[0]
-            block_rank = 3  # [block_size, kv_heads, head_dim]
-            block_shape = first_kv_cache.shape[-block_rank:]
-            block_size, n_kv_heads_x_2, head_dim = block_shape
-            self.slot_size_bytes = kv_elem_size * n_kv_heads_x_2 * head_dim
-        elif self.device_type == "cuda":
-            assert use_mla == self.use_mla
-            # TODO (NickLucche) not compatible with hybrid allocator.
-            # Enforce check once it goes live, as a single kv layout
-            # is expected for xfers.
-            if use_mla:
-                # MLA case.
-                self.num_blocks = first_kv_cache.shape[0]
-                block_rank = 2  # [block_size, latent_dim]
-                block_shape = first_kv_cache.shape[-block_rank:]
-                block_size, kv_latent_dim = block_shape
-                self.slot_size_bytes = kv_elem_size * kv_latent_dim
-            else:
-                # [2 (k and v), num_blocks, ...]
-                if self._use_flashinfer:
-                    # FlashInfer swaps 2<->num_blocks dimensions.
-                    self.num_blocks = first_kv_cache.shape[0]
-                    block_rank = 4  # [2, block_size, kv_heads, head_dim]
-                else:
-                    self.num_blocks = first_kv_cache.shape[1]
-                    block_rank = 3  # [block_size, kv_heads, head_dim]
-                block_shape = first_kv_cache.shape[-block_rank:]
-                block_size, n_kv_heads, head_dim = block_shape[-3:]
-                # head size in bytes.
-                self.slot_size_bytes = kv_elem_size * n_kv_heads * head_dim
-            assert block_size == self.block_size
-        elif self.device_type == "hpu":
+        if self.device_type == "hpu":
             # habana kv_cache: [2, num_blocks*block_size, kv_heads, head_dim]
             #from remote_pdb import RemotePdb; RemotePdb('0.0.0.0', 4444).set_trace()
             self.num_blocks = first_kv_cache[0].shape[0] // self.block_size
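With the TPU and CUDA branches removed, the connector only has to reason about the Gaudi layout, where each per-layer cache is a K/V pair of tensors shaped [num_blocks*block_size, kv_heads, head_dim], so the block count is recovered by integer division of the flattened first dimension. A toy illustration of that arithmetic (the dimensions and the slot-size naming are made up for the example, mirroring but not taken from the connector):

    import torch

    num_blocks, block_size, kv_heads, head_dim = 16, 128, 8, 64
    # habana-style layout: [2 (K and V), num_blocks * block_size, kv_heads, head_dim]
    kv_cache = torch.zeros(2, num_blocks * block_size, kv_heads, head_dim,
                           dtype=torch.bfloat16)

    kv_elem_size = kv_cache.element_size()                  # 2 bytes for bfloat16
    derived_num_blocks = kv_cache[0].shape[0] // block_size
    slot_size_bytes = kv_elem_size * kv_heads * head_dim    # bytes per token slot, per K or V

    assert derived_num_blocks == num_blocks
    print(derived_num_blocks, slot_size_bytes)              # 16 1024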

vllm_gaudi/platform.py

Lines changed: 2 additions & 3 deletions
@@ -78,7 +78,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         cache_config = vllm_config.cache_config
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 128
-        #vllm_config.kv_transfer_config.kv_buffer_device = 'hpu'
         if (parallel_config.distributed_executor_backend in ['mp', 'uni']
                 and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
             if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD",
@@ -121,8 +120,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:

     @classmethod
     def is_pin_memory_available(cls):
-        logger.warning("Pin memory is supported on HPU.")
-        return True
+        logger.warning("Pin memory is not supported on HPU.")
+        return False

     @classmethod
     def get_punica_wrapper(cls) -> str:
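is_pin_memory_available now reports False (and the warning text is corrected to match), so callers fall back to ordinary pageable host memory rather than page-locked buffers. A generic sketch of how such a flag is typically consumed when allocating a host staging buffer (this is not the plugin's actual call site, just an illustration):

    import torch

    def alloc_host_buffer(numel: int, dtype: torch.dtype, pin: bool) -> torch.Tensor:
        # pin_memory=True requests page-locked host memory (needs accelerator support);
        # with pin=False this is a plain pageable CPU tensor, which is the HPU path here.
        return torch.empty(numel, dtype=dtype, pin_memory=pin)

    staging = alloc_host_buffer(1024, torch.float32, pin=False)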

vllm_gaudi/v1/worker/hpu_model_runner.py

Lines changed: 3 additions & 4 deletions
@@ -622,7 +622,7 @@ def __init__(
             self.parallel_config)
         self.head_size = self.model_config.get_head_size()
         self.hidden_size = self.model_config.get_hidden_size()
-        logger.debug(f'buke model config: {self.model_config=}')
+        logger.debug(f'model config: {self.model_config=}')
         self.attn_backend = get_attn_backend(
             self.head_size,
             self.dtype,
@@ -2302,7 +2302,7 @@ def execute_model(
         if not has_kv_transfer_group():
             # Return empty ModelRunnerOuptut if there's no work to do.
             return EMPTY_MODEL_RUNNER_OUTPUT
-        #logger.info(f'buke before kv_connector_no_forward |{os.getpid()=}|{scheduler_output.total_num_scheduled_tokens=}|{scheduler_output=}')
+        #logger.debug(f'before kv_connector_no_forward |{os.getpid()=}|{scheduler_output.total_num_scheduled_tokens=}|{scheduler_output=}')
         # For D case, wait until kv finish load here
         return self.kv_connector_no_forward(scheduler_output, self.vllm_config)
     # If necessary, swap decodes/prompts to have all decodes on the start
@@ -2658,7 +2658,6 @@ def execute_model(
             finished_recving=finished_recving,
         )
     )
-    #logger.debug(f"buke hpu_model_runner.py: {model_runner_output=}")
     if has_kv_transfer_group():
         get_kv_transfer_group().clear_connector_metadata()
     return model_runner_output
@@ -3181,7 +3180,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         v_cache_shape = None if self.model_config.use_mla \
             else kv_cache_shape
         dtype = kv_cache_spec.dtype
-        #logger.debug(f'buke: |{os.getpid()=}|{kv_cache_shape=}')
+        logger.debug(f'|{os.getpid()=}|{kv_cache_shape=}')
         key_cache = torch.zeros(kv_cache_shape,
                                 dtype=dtype,
                                 device=self.device)
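The cleaned-up debug lines drop the stray "buke" author tag and rely on the f-string `=` specifier, which renders both the expression and its value, so the log line documents itself. For reference, a small example of that specifier:

    import os

    pid = os.getpid()
    print(f"|{pid=}|")               # e.g. |pid=12345|
    kv_cache_shape = (2, 2048, 8, 64)
    print(f"|{kv_cache_shape=}|")    # |kv_cache_shape=(2, 2048, 8, 64)|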
