
Conversation

@codeflash-ai codeflash-ai bot commented Nov 7, 2025

📄 13% (0.13x) speedup for get_weight_perm in python/sglang/srt/layers/quantization/moe_wna16.py

⏱️ Runtime : 5.90 milliseconds → 5.21 milliseconds (best of 215 runs)

📝 Explanation and details

Summary of optimizations (a short illustrative sketch follows this list):

  • Used bitwise operations (>> 2, & 3) for integer division and modulo to speed up inner loops.
  • Hoisted creation of commonly used interleave arrays for reuse and faster access.
  • Precomputed rows for both blocks to avoid recomputation.
  • Avoided list allocation in the innermost loop by using local variable assignment (extend = perm_list.extend).
  • Used .flatten() instead of .ravel() as it's slightly faster and avoids subtle memory issues for contiguous arrays.
  • No change in exceptions, return types, output, or mutation behavior.
  • Preserved all comments and variable names exactly as given.
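
The sketch below is a minimal illustration of how these changes could fit together on a marlin-style permutation builder. It is not the exact patched function (that lives in the PR diff for python/sglang/srt/layers/quantization/moe_wna16.py), and the helper name is hypothetical.

import numpy as np
import torch

# Hoisted once at module scope so repeated calls reuse the same interleave arrays.
_INTERLEAVE_4BIT = np.array([0, 2, 4, 6, 1, 3, 5, 7])
_INTERLEAVE_8BIT = np.array([0, 2, 1, 3])

def get_weight_perm_sketch(num_bits: int) -> torch.Tensor:
    perm_list = []
    extend = perm_list.extend  # bind the method once; avoids attribute lookup in the inner loop
    for i in range(32):
        col = i >> 2           # i // 4 via a right shift
        base = (i & 3) << 1    # 2 * (i % 4) via mask-and-shift
        rows = (base, base + 1, base + 8, base + 9)  # precomputed once, reused for both blocks
        perm1 = [16 * row + col + 8 * block for block in (0, 1) for row in rows]
        for j in range(4):
            extend(p + 256 * j for p in perm1)  # no temporary list per iteration
    if num_bits == 4:
        interleave = _INTERLEAVE_4BIT
    elif num_bits == 8:
        interleave = _INTERLEAVE_8BIT
    else:
        # The real function also rejects unsupported bit widths here.
        raise ValueError(f"num_bits must be 4 or 8, got {num_bits}")
    perm = np.array(perm_list).reshape((-1, len(interleave)))[:, interleave].flatten()
    return torch.from_numpy(perm)

Called with 4 or 8, such a builder returns a deterministic 1-D integer tensor, which is the behavior the generated regression tests below exercise (shape, uniqueness, dtype, determinism).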

Correctness verification report:

Test Status
⚙️ Existing Unit Tests 🔘 None Found
🌀 Generated Regression Tests 49 Passed
⏪ Replay Tests 🔘 None Found
🔎 Concolic Coverage Tests 🔘 None Found
📊 Tests Coverage 100.0%
🌀 Generated Regression Tests and Runtime
from __future__ import annotations

from typing import List

import numpy as np
# imports
import pytest  # used for our unit tests
import torch
from sglang.srt.layers.quantization.moe_wna16 import get_weight_perm

# unit tests

# -------------------------------
# Basic Test Cases
# -------------------------------

def test_basic_num_bits_4_type_and_shape():
    # Test that output is a torch.Tensor of correct shape for num_bits=4
    codeflash_output = get_weight_perm(4); perm = codeflash_output # 144μs -> 131μs (9.50% faster)

def test_basic_num_bits_8_type_and_shape():
    # Test that output is a torch.Tensor of correct shape for num_bits=8
    codeflash_output = get_weight_perm(8); perm = codeflash_output # 132μs -> 117μs (13.0% faster)

def test_basic_num_bits_4_content_range():
    # Test that all indices are in the expected range for num_bits=4
    codeflash_output = get_weight_perm(4); perm = codeflash_output # 130μs -> 115μs (13.0% faster)
    # Should be a permutation of numbers in range(0, 1024)
    perm_set = set(perm.tolist())

def test_basic_num_bits_8_content_range():
    # Test that all indices are in the expected range for num_bits=8
    codeflash_output = get_weight_perm(8); perm = codeflash_output # 129μs -> 113μs (14.0% faster)
    perm_set = set(perm.tolist())

def test_basic_num_bits_4_permutation_property():
    # Test that output is a permutation (no repeats, all present)
    codeflash_output = get_weight_perm(4); perm = codeflash_output # 126μs -> 114μs (10.3% faster)
    seen = set()
    for idx in perm.tolist():
        seen.add(idx)

def test_basic_num_bits_8_permutation_property():
    # Test that output is a permutation (no repeats, all present)
    codeflash_output = get_weight_perm(8); perm = codeflash_output # 128μs -> 113μs (12.7% faster)
    seen = set()
    for idx in perm.tolist():
        seen.add(idx)

# -------------------------------
# Edge Test Cases
# -------------------------------

def test_invalid_num_bits_raises():
    # Test that invalid num_bits raises Exception
    with pytest.raises(Exception) as excinfo:
        get_weight_perm(1) # 110μs -> 95.9μs (15.5% faster)
    with pytest.raises(Exception) as excinfo:
        get_weight_perm(16) # 105μs -> 90.3μs (16.8% faster)
    with pytest.raises(Exception) as excinfo:
        get_weight_perm(0) # 103μs -> 91.1μs (13.9% faster)
    with pytest.raises(Exception) as excinfo:
        get_weight_perm(-4) # 105μs -> 88.5μs (18.7% faster)


def test_num_bits_as_str():
    # Test that passing string values raises Exception
    with pytest.raises(Exception) as excinfo:
        get_weight_perm("4") # 117μs -> 103μs (13.1% faster)
    with pytest.raises(Exception) as excinfo:
        get_weight_perm("eight") # 105μs -> 90.6μs (15.9% faster)

def test_permutation_uniqueness_and_sortedness():
    # Test that permutation is not sorted (should be scrambled)
    codeflash_output = get_weight_perm(4); perm4 = codeflash_output # 139μs -> 124μs (12.2% faster)
    codeflash_output = get_weight_perm(8); perm8 = codeflash_output # 111μs -> 97.2μs (15.2% faster)

def test_permutation_dtype():
    # Test that output dtype is torch.int64 (torch.from_numpy keeps the numpy array's integer dtype)
    codeflash_output = get_weight_perm(4); perm4 = codeflash_output # 127μs -> 111μs (14.6% faster)
    codeflash_output = get_weight_perm(8); perm8 = codeflash_output # 109μs -> 95.5μs (14.8% faster)

# -------------------------------
# Large Scale Test Cases
# -------------------------------

def test_large_scale_memory_usage_and_permutation():
    # Test that the function does not exceed 100MB and output is correct for both num_bits
    # 2048 elements * 8 bytes (int64) = 16KB << 100MB, so this is safe
    codeflash_output = get_weight_perm(4); perm4 = codeflash_output # 125μs -> 113μs (10.1% faster)
    codeflash_output = get_weight_perm(8); perm8 = codeflash_output # 110μs -> 95.0μs (15.8% faster)

def test_large_scale_random_spot_checks():
    # Spot check a few known positions for determinism and stability
    codeflash_output = get_weight_perm(4); perm4 = codeflash_output # 124μs -> 112μs (10.6% faster)
    codeflash_output = get_weight_perm(8); perm8 = codeflash_output # 109μs -> 96.5μs (13.5% faster)
    # Check that indices are not repeated
    seen4 = set()
    for idx in [perm4[0].item(), perm4[1023].item(), perm4[-1].item()]:
        seen4.add(idx)
    seen8 = set()
    for idx in [perm8[0].item(), perm8[511].item(), perm8[-1].item()]:
        seen8.add(idx)

def test_large_scale_permutation_consistency():
    # Ensure that repeated calls yield the same permutation (determinism)
    codeflash_output = get_weight_perm(4); perm4a = codeflash_output # 128μs -> 112μs (14.2% faster)
    codeflash_output = get_weight_perm(4); perm4b = codeflash_output # 109μs -> 94.6μs (16.1% faster)
    codeflash_output = get_weight_perm(8); perm8a = codeflash_output # 109μs -> 92.7μs (18.0% faster)
    codeflash_output = get_weight_perm(8); perm8b = codeflash_output # 107μs -> 94.0μs (14.4% faster)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
from __future__ import annotations

from typing import List

import numpy as np
# imports
import pytest  # used for our unit tests
import torch
from sglang.srt.layers.quantization.moe_wna16 import get_weight_perm

# unit tests

# -------------------------
# Basic Test Cases
# -------------------------

def test_perm_type_and_dtype_4bit():
    """Test that the function returns a torch tensor of the correct dtype for 4 bits."""
    codeflash_output = get_weight_perm(4); perm = codeflash_output # 125μs -> 113μs (10.6% faster)

def test_perm_type_and_dtype_8bit():
    """Test that the function returns a torch tensor of the correct dtype for 8 bits."""
    codeflash_output = get_weight_perm(8); perm = codeflash_output # 126μs -> 113μs (11.4% faster)

def test_perm_shape_4bit():
    """Test the shape of the permutation tensor for 4 bits."""
    codeflash_output = get_weight_perm(4); perm = codeflash_output # 125μs -> 112μs (11.6% faster)

def test_perm_shape_8bit():
    """Test the shape of the permutation tensor for 8 bits."""
    codeflash_output = get_weight_perm(8); perm = codeflash_output # 127μs -> 113μs (12.1% faster)

def test_perm_values_are_unique_4bit():
    """Test that all values in the permutation tensor are unique for 4 bits."""
    codeflash_output = get_weight_perm(4); perm = codeflash_output # 125μs -> 114μs (9.79% faster)
    unique = set(perm.tolist())

def test_perm_values_are_unique_8bit():
    """Test that all values in the permutation tensor are unique for 8 bits."""
    codeflash_output = get_weight_perm(8); perm = codeflash_output # 125μs -> 113μs (10.4% faster)
    unique = set(perm.tolist())

def test_perm_values_are_integers_4bit():
    """Test that all values in the permutation tensor are integers for 4 bits."""
    codeflash_output = get_weight_perm(4); perm = codeflash_output # 127μs -> 112μs (13.1% faster)
    for val in perm.tolist():
        pass

def test_perm_values_are_integers_8bit():
    """Test that all values in the permutation tensor are integers for 8 bits."""
    codeflash_output = get_weight_perm(8); perm = codeflash_output # 126μs -> 113μs (12.3% faster)
    for val in perm.tolist():
        pass

# -------------------------
# Edge Test Cases
# -------------------------

def test_invalid_num_bits_zero():
    """Test that passing 0 as num_bits raises an Exception."""
    with pytest.raises(Exception) as excinfo:
        get_weight_perm(0) # 110μs -> 96.3μs (14.2% faster)

def test_invalid_num_bits_negative():
    """Test that passing a negative num_bits raises an Exception."""
    with pytest.raises(Exception) as excinfo:
        get_weight_perm(-4) # 113μs -> 97.1μs (16.7% faster)

def test_invalid_num_bits_large():
    """Test that passing a large invalid num_bits raises an Exception."""
    with pytest.raises(Exception) as excinfo:
        get_weight_perm(32) # 108μs -> 93.6μs (15.5% faster)

def test_invalid_num_bits_non_integer():
    """Test that passing a non-integer num_bits raises an Exception."""
    with pytest.raises(Exception) as excinfo:
        get_weight_perm("4") # 109μs -> 96.9μs (12.6% faster)


def test_perm_range_4bit():
    """Test that the permutation for 4 bits covers the range 0..1023 exactly."""
    codeflash_output = get_weight_perm(4); perm = codeflash_output # 142μs -> 126μs (12.2% faster)
    expected = set(range(1024))
    actual = set(perm.tolist())

def test_perm_range_8bit():
    """Test that the permutation for 8 bits covers the range 0..511 exactly."""
    codeflash_output = get_weight_perm(8); perm = codeflash_output # 133μs -> 120μs (10.8% faster)
    expected = set(range(1024))
    actual = set(perm.tolist())

def test_perm_is_1d_tensor_4bit():
    """Test that the returned permutation tensor is 1-dimensional for 4 bits."""
    codeflash_output = get_weight_perm(4); perm = codeflash_output # 128μs -> 116μs (10.3% faster)

def test_perm_is_1d_tensor_8bit():
    """Test that the returned permutation tensor is 1-dimensional for 8 bits."""
    codeflash_output = get_weight_perm(8); perm = codeflash_output # 128μs -> 115μs (11.2% faster)

# -------------------------
# Large Scale Test Cases
# -------------------------

def test_large_scale_multiple_calls():
    """Test calling the function multiple times in succession to check for memory leaks or state retention."""
    perms_4 = [get_weight_perm(4) for _ in range(100)] # 129μs -> 113μs (14.1% faster)
    perms_8 = [get_weight_perm(8) for _ in range(100)] # 111μs -> 95.6μs (17.0% faster)
    # Check that all returned tensors are unique objects (not the same reference)
    for i in range(1, 100):
        pass
    # Check that all tensors have the correct shape
    for perm in perms_4:
        pass
    for perm in perms_8:
        pass

def test_large_scale_permutation_correctness():
    """Test that the permutation is correct under repeated calls and does not change."""
    codeflash_output = get_weight_perm(4); perm1_4 = codeflash_output # 138μs -> 126μs (9.73% faster)
    codeflash_output = get_weight_perm(4); perm2_4 = codeflash_output # 109μs -> 95.0μs (15.4% faster)
    codeflash_output = get_weight_perm(8); perm1_8 = codeflash_output # 108μs -> 94.0μs (15.8% faster)
    codeflash_output = get_weight_perm(8); perm2_8 = codeflash_output # 107μs -> 92.5μs (16.7% faster)

def test_large_scale_memory_usage():
    """Test that the function does not allocate more than 100MB of memory for its output."""
    codeflash_output = get_weight_perm(4); perm_4 = codeflash_output # 125μs -> 110μs (13.7% faster)
    codeflash_output = get_weight_perm(8); perm_8 = codeflash_output # 108μs -> 91.3μs (18.6% faster)

def test_large_scale_values_distribution_4bit():
    """Test that the permutation for 4 bits is a true permutation (all indices from 0 to 1023 exactly once)."""
    codeflash_output = get_weight_perm(4); perm = codeflash_output # 126μs -> 111μs (13.3% faster)
    seen = [0] * 1024
    for val in perm.tolist():
        seen[val] += 1
    for idx, count in enumerate(seen):
        pass
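
Note that the generated tests appear to be displayed with their assertion lines stripped, which is why several bodies end in bare loops or pass. A hedged sketch of what a fully asserted property check could look like (written for this report, not generated by Codeflash) is:

def test_perm_full_property_check_sketch():
    for num_bits in (4, 8):
        perm = get_weight_perm(num_bits)
        assert isinstance(perm, torch.Tensor)
        assert perm.ndim == 1
        # A true permutation: every index 0..N-1 appears exactly once.
        assert sorted(perm.tolist()) == list(range(perm.numel()))
        # Deterministic: repeated calls return identical tensors.
        assert torch.equal(perm, get_weight_perm(num_bits))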

To edit these changes, run git checkout codeflash/optimize-get_weight_perm-mhoxpv9a and push.

@codeflash-ai codeflash-ai bot requested a review from mashraf-222 November 7, 2025 14:12
@codeflash-ai codeflash-ai bot added the ⚡️ codeflash Optimization PR opened by Codeflash AI label Nov 7, 2025