18 changes: 9 additions & 9 deletions .github/workflows/vllm_ascend_test.yaml
@@ -49,7 +49,7 @@ jobs:
e2e_tracker: ${{ steps.filter.outputs.e2e_tracker }}
ut_tracker: ${{ steps.filter.outputs.ut_tracker }}
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: dorny/paths-filter@v3
id: filter
with:
@@ -130,9 +130,9 @@ jobs:
verbose: true

e2e:
needs: [lint, changes]
needs: [changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
if: ${{ github.event_name == 'pull_request' && needs.lint.result == 'success' && needs.changes.outputs.e2e_tracker == 'true' }}
if: ${{ github.event_name == 'pull_request' && needs.changes.outputs.e2e_tracker == 'true' }}
strategy:
max-parallel: 2
matrix:
@@ -160,15 +160,15 @@ jobs:
apt install git -y

- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v5
uses: actions/checkout@v4

- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev

- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v5
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ matrix.vllm_version }}
@@ -192,7 +192,7 @@ jobs:
VLLM_USE_MODELSCOPE: True
run: |
pytest -sv tests/e2e/singlecard/test_offline_inference.py
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
# pytest -sv tests/e2e/singlecard/test_ilama_lora.py
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
pytest -sv tests/e2e/singlecard/test_camem.py
pytest -sv tests/e2e/singlecard/test_embedding.py
@@ -242,15 +242,15 @@ jobs:
apt install git -y

- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v5
uses: actions/checkout@v4

- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev

- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v5
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ matrix.vllm_version }}
@@ -273,7 +273,7 @@ jobs:
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
run: |
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
# pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
# Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
# To avoid oom, we need to run the test in a single process.
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
2 changes: 1 addition & 1 deletion examples/offline_inference_audio_language.py
@@ -29,7 +29,7 @@

from vllm.assets.audio import AudioAsset
try:
import librosa
import librosa # type: ignore
except ImportError:
raise Exception("Can't import librosa, please ensure it's installed")

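For reference, a minimal sketch (not part of this PR) of the guarded optional-import pattern used above; the `# type: ignore` comment keeps mypy from flagging the import when librosa or its type stubs are absent from the linting environment. The `load_audio` helper is hypothetical and only illustrates the idea:

```python
# Hedged sketch of the optional-import guard; librosa.load is the standard
# loader returning (waveform, sample_rate).
try:
    import librosa  # type: ignore
except ImportError as err:
    raise RuntimeError("librosa is required for the audio example; "
                       "install it with `pip install librosa`") from err


def load_audio(path: str, sample_rate: int = 16000):
    """Load an audio file and resample it to `sample_rate` Hz."""
    waveform, sr = librosa.load(path, sr=sample_rate)
    return waveform, sr
```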
4 changes: 2 additions & 2 deletions tests/e2e/singlecard/sample/test_rejection_sampler.py
@@ -4,7 +4,7 @@
import pytest
import torch
import torch.nn.functional as F
from vllm.v1.sample.logits_processor import LogitsProcessorManager
from vllm.v1.sample.logits_processor import LogitsProcessors
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata

@@ -66,7 +66,7 @@ def create_sampling_metadata(
output_token_ids=[],
allowed_token_ids_mask=None,
bad_words_token_ids={},
logitsprocs=LogitsProcessorManager())
logitsprocs=LogitsProcessors())


########################### Tests for Greedy Sampling ###################
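The rename above follows upstream vLLM, where `LogitsProcessorManager` was replaced by `LogitsProcessors`. A minimal compatibility sketch (an assumption, not code from this PR) for test code that must run against both older and newer vLLM releases:

```python
# Hedged sketch: prefer the new class name and fall back to the old one.
# Assumes only the name changed; the diff above constructs it with no arguments.
try:
    from vllm.v1.sample.logits_processor import LogitsProcessors
except ImportError:  # older vLLM releases expose the previous name
    from vllm.v1.sample.logits_processor import (  # type: ignore
        LogitsProcessorManager as LogitsProcessors)


def empty_logitsprocs() -> "LogitsProcessors":
    """Empty logits-processor container for SamplingMetadata.logitsprocs."""
    return LogitsProcessors()
```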
114 changes: 59 additions & 55 deletions tests/ut/attention/test_attention_v1.py
@@ -9,6 +9,7 @@
AscendAttentionState,
AscendMetadata,
CommonAttentionState)
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata


class TestAscendAttentionBackend(TestBase):
@@ -67,8 +67,12 @@ def test_copy_blocks(self):
class TestAscendAttentionMetadataBuilder(TestBase):

def setUp(self):
self.mock_runner = MagicMock()
self.builder = AscendAttentionMetadataBuilder(self.mock_runner)
self.mock_vllm_config = MagicMock()
self.mock_vllm_config.model_config.max_model_len = 640
self.mock_vllm_config.cache_config.block_size = 64
self.mock_device = 'cpu:0'
self.builder = AscendAttentionMetadataBuilder(self.mock_vllm_config,
self.mock_device)

def test_reorder_batch(self):
mock_input_batch = MagicMock()
@@ -86,31 +91,28 @@ def test_reorder_batch(self):
def test_build_prefill_no_cache(self, mock_is_310p, mock_nd_to_nz_2d,
mock_npu_format_cast,
mock_ascend_metadata):
num_reqs = 2
num_actual_tokens = 10
max_query_len = 5

self.mock_runner.input_batch.block_table = [MagicMock()]
self.mock_runner.input_batch.block_table[
0].get_device_tensor.return_value = torch.zeros((10, 10))
self.mock_runner.max_num_blocks_per_req = 10
self.mock_runner.query_lens = torch.tensor([3, 4])
self.mock_runner.seq_lens_cpu = torch.tensor([5, 6])
self.mock_runner.slot_mapping_cpu = torch.tensor(range(20))
self.mock_runner.device = 'cpu:0'
self.mock_runner.attn_mask = torch.ones((10, 10))
self.mock_runner.attn_state = AscendAttentionState.PrefillNoCache
self.mock_runner.query_start_loc_cpu = torch.tensor([0, 3, 7])
common_attn_metadata = AscendCommonAttentionMetadata(
query_start_loc=torch.tensor([0, 3, 7]),
query_start_loc_cpu=torch.tensor([0, 3, 7]),
seq_lens_cpu=torch.tensor([5, 6]),
num_reqs=2,
num_actual_tokens=10,
max_query_len=5,
decode_token_per_req=torch.tensor([1, 1]),
block_table_tensor=torch.zeros((10, 10)),
slot_mapping_cpu=torch.tensor(range(20)),
actual_seq_lengths_q=torch.tensor([0, 1]),
positions=torch.tensor([10, 10]),
attn_mask=torch.ones((10, 10)),
spec_attn_mask=None,
attn_state=AscendAttentionState.PrefillNoCache)

mock_nz_tensor = MagicMock()
mock_model = MagicMock()
mock_nd_to_nz_2d.return_value = mock_nz_tensor
mock_npu_format_cast.return_value = mock_nz_tensor

self.builder.build(
num_reqs,
num_actual_tokens,
max_query_len,
)
self.builder.build(common_attn_metadata, mock_model)

@patch('vllm_ascend.attention.attention_v1.AscendMetadata')
@patch('torch_npu.npu_format_cast')
@@ -120,51 +122,53 @@ def test_build_prefill_no_cache(self, mock_is_310p, mock_nd_to_nz_2d,
def test_build_chunked_prefill(self, mock_ascend_attention_state,
mock_is_310p, mock_nd_to_nz_spec,
mock_npu_format_cast, mock_ascend_metadata):
num_reqs = 3
num_actual_tokens = 15
max_query_len = 6

self.mock_runner.input_batch.block_table = [MagicMock()]
self.mock_runner.input_batch.block_table[
0].get_device_tensor.return_value = torch.zeros((10, 10))
self.mock_runner.max_num_blocks_per_req = 10
self.mock_runner.query_lens = torch.tensor([2, 3, 4])
self.mock_runner.seq_lens_cpu = torch.tensor([4, 5, 6])
self.mock_runner.slot_mapping_cpu = torch.tensor(range(20))
self.mock_runner.device = 'cpu:0'
self.mock_runner.attn_mask = torch.ones((15, 15))
self.mock_runner.attn_state = AscendAttentionState.ChunkedPrefill
self.mock_runner.query_start_loc_cpu = torch.tensor([0, 2, 5, 9])
common_attn_metadata = AscendCommonAttentionMetadata(
query_start_loc=torch.tensor([0, 2, 5, 9]),
query_start_loc_cpu=torch.tensor([0, 2, 5, 9]),
seq_lens_cpu=torch.tensor([4, 5, 6]),
num_reqs=3,
num_actual_tokens=15,
max_query_len=6,
decode_token_per_req=torch.tensor([1, 1, 1]),
block_table_tensor=torch.zeros((10, 10)),
slot_mapping_cpu=torch.tensor(range(20)),
actual_seq_lengths_q=torch.tensor([0, 1, 2]),
positions=torch.tensor([10, 10]),
attn_mask=torch.ones((15, 15)),
spec_attn_mask=None,
attn_state=AscendAttentionState.ChunkedPrefill)

mock_ascend_attention_state = MagicMock()
mock_ascend_attention_state.PrefillNoCache = 0

mock_nz_tensor = MagicMock()
mock_model = MagicMock()
mock_nd_to_nz_spec.return_value = mock_nz_tensor
mock_npu_format_cast.return_value = mock_nz_tensor

self.builder.build(num_reqs, num_actual_tokens, max_query_len)
self.builder.build(common_attn_metadata, mock_model)

@patch('vllm_ascend.attention.attention_v1.AscendMetadata')
@patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
def test_build_non_310p(self, mock_is_310p, mock_ascend_metadata):
num_reqs = 3
num_actual_tokens = 15
max_query_len = 6

self.mock_runner.input_batch.block_table = [MagicMock()]
self.mock_runner.input_batch.block_table[
0].get_device_tensor.return_value = torch.zeros((10, 10))
self.mock_runner.max_num_blocks_per_req = 10
self.mock_runner.query_lens = torch.tensor([2, 3, 4])
self.mock_runner.seq_lens_cpu = torch.tensor([4, 5, 6])
self.mock_runner.slot_mapping_cpu = torch.tensor(range(20))
self.mock_runner.device = 'cpu:0'
self.mock_runner.attn_mask = torch.ones((15, 15))
self.mock_runner.attn_state = AscendAttentionState.ChunkedPrefill
self.mock_runner.query_start_loc_cpu = torch.tensor([0, 2, 5, 9])

self.builder.build(num_reqs, num_actual_tokens, max_query_len)
common_attn_metadata = AscendCommonAttentionMetadata(
query_start_loc=torch.tensor([0, 2, 5, 9]),
query_start_loc_cpu=torch.tensor([0, 2, 5, 9]),
seq_lens_cpu=torch.tensor([4, 5, 6]),
num_reqs=3,
num_actual_tokens=15,
max_query_len=6,
decode_token_per_req=torch.tensor([1, 1, 1]),
block_table_tensor=torch.zeros((10, 10)),
slot_mapping_cpu=torch.tensor(range(20)),
actual_seq_lengths_q=torch.tensor([0, 1, 2]),
positions=torch.tensor([10, 10]),
attn_mask=torch.ones((15, 15)),
spec_attn_mask=None,
attn_state=AscendAttentionState.ChunkedPrefill)
mock_model = MagicMock()

self.builder.build(common_attn_metadata, mock_model)


class TestAscendAttentionBackendImpl(TestBase):
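The rewritten tests pass an `AscendCommonAttentionMetadata` and a model mock to `builder.build()` instead of reading fields off a runner mock. The dataclass itself is not shown in this diff; the sketch below is a reconstruction inferred from the keyword arguments used above (field names come from those calls, while types and optionality are assumptions):

```python
# Hedged reconstruction of vllm_ascend.attention.utils.AscendCommonAttentionMetadata.
# Field names mirror the constructor calls in the tests above; consult the real
# module for the authoritative definition.
from dataclasses import dataclass
from typing import Any, Optional

import torch


@dataclass
class AscendCommonAttentionMetadata:
    query_start_loc: torch.Tensor
    query_start_loc_cpu: torch.Tensor
    seq_lens_cpu: torch.Tensor
    num_reqs: int
    num_actual_tokens: int
    max_query_len: int
    decode_token_per_req: torch.Tensor
    block_table_tensor: torch.Tensor
    slot_mapping_cpu: torch.Tensor
    actual_seq_lengths_q: torch.Tensor
    positions: torch.Tensor
    attn_mask: Optional[torch.Tensor]
    spec_attn_mask: Optional[torch.Tensor]
    attn_state: Any  # an AscendAttentionState value
```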