Commit 3fdeee8

Merge branch 'main' into kylesayrs/calib
2 parents 3003c83 + 4caf540; commit 3fdeee8

28 files changed: +193 -166 lines changed

.github/workflows/linkcheck.yml

Lines changed: 2 additions & 4 deletions
@@ -2,11 +2,9 @@ name: Check Markdown links

 on:
   push:
-    branches:
-      - main
+    branches: [ main, 'release/*' ]
   pull_request:
-    branches:
-      - main
+    branches: [ main, 'release/*' ]

   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:

.github/workflows/quality-check.yaml

Lines changed: 2 additions & 6 deletions
@@ -1,13 +1,9 @@
 name: Quality Checks
 on:
   push:
-    branches:
-      - main
-      - 'release/*'
+    branches: [ main , 'release/*' ]
   pull_request:
-    branches:
-      - main
-      - 'release/*'
+    branches: [ main, 'release/*' ]
 jobs:
   quality-check:
     runs-on: ubuntu-22.04

.github/workflows/set-comment.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 name: PR Reminder Comment Bot
 on:
   pull_request_target:
-    branches: [main]
+    branches: [ main, 'release/*' ]
     types: [opened]

 jobs:

.github/workflows/test-check-transformers.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,10 +1,10 @@
 name: Test Checks (Transformers)
 on:
   pull_request:
-    branches: [ main ]
+    branches: [ main, 'release/*' ]
     types: [ labeled, synchronize ]
   push:
-    branches: [ main ]
+    branches: [ main, 'release/*' ]
   workflow_dispatch:
     inputs:
       code_coverage:

.github/workflows/test-check.yaml

Lines changed: 3 additions & 53 deletions
@@ -1,9 +1,9 @@
 name: Test Checks (Base/PyTorch)
 on:
   pull_request:
-    branches:
-      - main
+    branches: [ main, 'release/*' ]
   push:
+    branches: [ main, 'release/*' ]
   workflow_dispatch:
     inputs:
       code_coverage:
@@ -115,59 +115,9 @@ jobs:
         run: |
           coverage report --data-file="$COVERAGE_FILE" --skip-empty --format="markdown" > "$GITHUB_STEP_SUMMARY"

-  compat-pytorch-1_9-pytorch-tests:
-    runs-on: ubuntu-22.04
-    env:
-      COVERAGE_FILE: ".coverage.compat-pytorch-1.9"
-    steps:
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          fetch-tags: true
-      - name: "⚙️ Install dependencies"
-        run: pip3 install -U pip setuptools && pip3 install .[dev]
-      - uses: actions/checkout@v4
-        with:
-          repository: "neuralmagic/compressed-tensors"
-          path: "compressed-tensors"
-          fetch-depth: 0
-          fetch-tags: true
-      - name: "⚙️ Install compressed-tensors dependencies"
-        run: |
-          pip3 uninstall -y compressed-tensors
-          export GIT_CEILING_DIRECTORIES="$(pwd)"
-          cd compressed-tensors
-          BUILD_TYPE=nightly pip3 install .
-      - name: "Clean compressed-tensors directory"
-        run: rm -r compressed-tensors/
-      - name: "⚙️ Prepare code coverage"
-        if: inputs.code_coverage
-        uses: ./.github/actions/prepare-code-coverage
-      - name: "🔬 Running pytorch tests"
-        run: |
-          pytest -v tests/llmcompressor/pytorch
-      - name: "Upload coverage report"
-        if: (success() || failure()) && inputs.code_coverage
-        uses: actions/upload-artifact@v4
-        with:
-          name: compat-pytorch-tests-coverage-results
-          path: |
-            .coverage*
-            coverage-html
-            coverage.json
-          include-hidden-files: true
-          retention-days: 5
-      - name: "Report coverage"
-        if: (success() || failure()) && inputs.code_coverage
-        run: |
-          coverage report --data-file="$COVERAGE_FILE" --skip-empty --format="markdown" > "$GITHUB_STEP_SUMMARY"
-
   combine-coverage:
     runs-on: ubuntu-22.04
-    needs: [base-tests, pytorch-tests, compat-pytorch-1_9-pytorch-tests]
+    needs: [base-tests, pytorch-tests]
     if: (success() || failure()) && inputs.code_coverage
     steps:
       - name: "Checkout llm-compressor"

Makefile

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ endif
 quality:
	@echo "Running python quality checks";
	ruff check $(CHECKDIRS);
+	ruff format --check $(CHECKDIRS);
	isort --check-only $(CHECKDIRS);
	flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203,W605;

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ skip = ["src/llmcompressor/transformers/tracing/", "src/llmcompressor/version.py
 files = "src/guidellm"

 [tool.ruff]
-exclude = ["build", "dist", "env", ".venv", "src/llmcompressor/transformers/tracing/"]
+exclude = ["build", "dist", "env", ".venv", "src/llmcompressor/transformers/tracing/", "src/llmcompressor/version.py"]
 lint.select = ["E", "F", "W"]
 lint.extend-ignore = ["E203", "W605"]

setup.py

Lines changed: 8 additions & 40 deletions
@@ -110,63 +110,31 @@ def localversion_func(version: ScmVersion) -> str:
         "src", include=["llmcompressor", "llmcompressor.*"], exclude=["*.__pycache__.*"]
     ),
     install_requires=[
-        (
-            "loguru>=0.7.2,<=0.7.3"
-            if BUILD_TYPE == "release"
-            else "loguru>=0.7.2"
-        ),
-        (
-            "pyyaml>=6.0.1,<=6.0.2"
-            if BUILD_TYPE == "release"
-            else "pyyaml>=6.0.1"
-        ),
+        ("loguru>=0.7.2,<=0.7.3" if BUILD_TYPE == "release" else "loguru>=0.7.2"),
+        ("pyyaml>=6.0.1,<=6.0.2" if BUILD_TYPE == "release" else "pyyaml>=6.0.1"),
         # librosa dependency numba is currently not compatible with numpy>=2.3
         # https://numba.readthedocs.io/en/stable/user/installing.html#version-support-information
-        (
-            "numpy>=2.0.0,<=2.3.2"
-            if BUILD_TYPE == "release"
-            else "numpy>=2.0.0"
-        ),
+        ("numpy>=2.0.0,<=2.3.2" if BUILD_TYPE == "release" else "numpy>=2.0.0"),
         (
             "requests>=2.32.2,<=2.32.5"
             if BUILD_TYPE == "release"
             else "requests>=2.32.2"
         ),
-        (
-            "tqdm>=4.66.3,<=4.67.1"
-            if BUILD_TYPE == "release"
-            else "tqdm>=4.66.3"
-        ),
-        (
-            "torch>=2.7.0,<=2.8.0"
-            if BUILD_TYPE == "release"
-            else "torch>=2.7.0"
-        ),
+        ("tqdm>=4.66.3,<=4.67.1" if BUILD_TYPE == "release" else "tqdm>=4.66.3"),
+        ("torch>=2.7.0,<=2.8.0" if BUILD_TYPE == "release" else "torch>=2.7.0"),
         (
             "transformers>=4.53.0,<=4.55.2"
             if BUILD_TYPE == "release"
             else "transformers>=4.53.0"
         ),
-        (
-            "datasets>=4.0.0,<=4.0.0"
-            if BUILD_TYPE == "release"
-            else "datasets>=4.0.0"
-        ),
+        ("datasets>=4.0.0,<=4.0.0" if BUILD_TYPE == "release" else "datasets>=4.0.0"),
         (
             "accelerate>=1.6.0,<=1.10.0"
             if BUILD_TYPE == "release"
             else "accelerate>=1.6.0"
         ),
-        (
-            "pynvml>=11.5.3,<=12.0.0"
-            if BUILD_TYPE == "release"
-            else "pynvml>=11.5.3"
-        ),
-        (
-            "pillow>=10.4.0,<=10.4.0"
-            if BUILD_TYPE == "release"
-            else "pillow>=10.4.0"
-        ),
+        ("pynvml>=11.5.3,<=12.0.0" if BUILD_TYPE == "release" else "pynvml>=11.5.3"),
+        ("pillow>=10.4.0,<=10.4.0" if BUILD_TYPE == "release" else "pillow>=10.4.0"),
         (
             "compressed-tensors==0.11.0"
             if BUILD_TYPE == "release"
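
The reformatted entries above all follow the same release-vs-nightly pinning pattern: release builds get an upper bound, every other build keeps only the open-ended lower bound. A minimal sketch of that pattern, assuming BUILD_TYPE is read from the environment as in this setup.py; the pinned() helper is hypothetical and only illustrates the inline conditionals, it is not part of the repository code:

import os

# Hypothetical helper illustrating the conditional-pin pattern; setup.py itself
# writes each conditional expression inline rather than using a function.
BUILD_TYPE = os.environ.get("BUILD_TYPE", "dev")

def pinned(lower_bound: str, release_cap: str) -> str:
    """Return an upper-bounded requirement for release builds,
    otherwise keep only the open-ended lower bound."""
    return f"{lower_bound},{release_cap}" if BUILD_TYPE == "release" else lower_bound

install_requires = [
    pinned("loguru>=0.7.2", "<=0.7.3"),  # mirrors the first entry above
    pinned("torch>=2.7.0", "<=2.8.0"),   # mirrors the torch entry above
]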

src/llmcompressor/modifiers/quantization/gptq/base.py

Lines changed: 14 additions & 14 deletions
@@ -72,8 +72,9 @@ class GPTQModifier(Modifier, QuantizationMixin):
     :param block_size: Used to determine number of columns to compress in one pass
     :param dampening_frac: Amount of dampening to apply to H, as a fraction of the
         diagonal norm
-    :param actorder: order in which weight columns are quantized. For more information,
-        on actorder options, see https://github.com/vllm-project/vllm/pull/8135
+    :param actorder: order in which weight columns are quantized. Defaults to "static"
+        activation ordering, which achieves best accuracy recovery with no runtime cost.
+        For more information, see https://github.com/vllm-project/vllm/pull/8135
     :param offload_hessians: Set to True for decreased memory usage but increased
         runtime.
@@ -106,7 +107,7 @@ class GPTQModifier(Modifier, QuantizationMixin):
     sequential_targets: Union[str, List[str], None] = None
     block_size: int = 128
     dampening_frac: Optional[float] = 0.01
-    actorder: Optional[Union[ActivationOrdering, Sentinel]] = None
+    actorder: Optional[Union[ActivationOrdering, Sentinel]] = Sentinel("static")
     offload_hessians: bool = False

     # private variables
@@ -134,18 +135,17 @@ def resolve_actorder(existing):
                 return ActivationOrdering.STATIC if existing is None else existing

             # user-provided value always attempts to override
-            if self.actorder is not None:
-                if existing is None or self.actorder == existing:
-                    return self.actorder
-                raise ValueError(
-                    "Cannot resolve activation ordering when both "
-                    "`GPTQModifier.actorder` and `QuantizationScheme.actorder` "
-                    "are provided and differ. Either set `GPTQModifier.actorder = "
-                    "None` or remove `actorder` from config groups."
-                )
+            if existing is None or self.actorder == existing:
+                return self.actorder

-            # setting `GPTQModifier.actorder = None` does nothing
-            return existing
+            # if existing provided and conflicts
+            raise ValueError(
+                "Cannot resolve activation ordering when both "
+                "`GPTQModifier.actorder` and `QuantizationScheme.actorder` "
+                f"are provided and differ ({self.actorder}, {existing}). "
+                "Either unset `GPTQModifier.actorder` or "
+                "remove `actorder` from config groups."
+            )

         for scheme in config.config_groups.values():
             assert isinstance(scheme, QuantizationScheme)
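
To make the new default concrete, here is a hedged, standalone sketch of the resolution behavior the hunk above implements: the Sentinel("static") default falls back to static ordering unless the scheme already sets one, an explicit user value wins when the scheme agrees or is unset, and a conflict raises. The function below is illustrative only, models the sentinel as the plain string "static", and is not the repository code:

from enum import Enum
from typing import Optional, Union

class ActivationOrdering(str, Enum):  # simplified stand-in for the real enum
    STATIC = "static"
    GROUP = "group"
    WEIGHT = "weight"

SENTINEL_STATIC = "static"  # models Sentinel("static") for illustration only

def resolve_actorder(
    modifier_actorder: Union[str, ActivationOrdering, None],
    existing: Optional[ActivationOrdering],
) -> Optional[ActivationOrdering]:
    # Default sentinel: use static ordering unless the scheme already chose one.
    if modifier_actorder == SENTINEL_STATIC:
        return ActivationOrdering.STATIC if existing is None else existing
    # A user-provided value (including None) wins when the scheme agrees or is unset.
    if existing is None or modifier_actorder == existing:
        return modifier_actorder
    # Conflicting values raise, mirroring the ValueError in the diff above.
    raise ValueError(
        f"Cannot resolve activation ordering: modifier={modifier_actorder!r}, "
        f"scheme={existing!r}"
    )

# e.g. resolve_actorder(SENTINEL_STATIC, None) -> ActivationOrdering.STATIC
# e.g. resolve_actorder(None, ActivationOrdering.GROUP) -> raises ValueError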

tests/e2e/e2e_utils.py

Lines changed: 4 additions & 1 deletion
@@ -70,7 +70,10 @@ def data_collator(batch):
     # a compatible preset sceme
     if quant_type == "GPTQ":
         oneshot_kwargs["recipe"] = GPTQModifier(
-            targets="Linear", scheme=scheme, ignore=["lm_head"]
+            targets="Linear",
+            scheme=scheme,
+            actorder=None,  # added for consistency with past testing configs
+            ignore=["lm_head"],
         )
     else:
         oneshot_kwargs["recipe"] = QuantizationModifier(
