NVIDIA
diff --git a/‎examples/llm_eval/lm_eval_hf.py‎
Lines changed: 14 additions & 0 deletions b/‎examples/llm_eval/lm_eval_hf.py‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎examples/llm_eval/mmlu.py‎
Lines changed: 2 additions & 0 deletions b/‎examples/llm_eval/mmlu.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎examples/llm_eval/quantization_utils.py‎
Lines changed: 37 additions & 7 deletions b/‎examples/llm_eval/quantization_utils.py‎
Lines changed: 37 additions & 7 deletions
diff --git a/‎modelopt/torch/quantization/algorithms.py‎
Lines changed: 170 additions & 2 deletions b/‎modelopt/torch/quantization/algorithms.py‎
Lines changed: 170 additions & 2 deletions
@@ -53,6 +53,7 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
 
     quant_cfg = arg_dict.pop("quant_cfg", None)
     auto_quantize_bits = arg_dict.pop("auto_quantize_bits", None)
+    auto_quantize_method = arg_dict.pop("auto_quantize_method", "gradient")
     calib_batch_size = arg_dict.pop("calib_batch_size", None)
     calib_size = arg_dict.pop("calib_size", 512)
     compress = arg_dict.pop("compress", False)
@@ -81,6 +82,7 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
             batch_size=calib_batch_size,
             calib_size=calib_size,
             auto_quantize_bits=auto_quantize_bits,
+            auto_quantize_method=auto_quantize_method,
             test_generated=False,
             compress=compress,
         )
@@ -109,6 +111,17 @@ def setup_parser_with_modelopt_args():
             "regular quantization will be applied."
         ),
     )
+    parser.add_argument(
+        "--auto_quantize_method",
+        type=str,
+        default="gradient",
+        choices=["gradient", "kl_div"],
+        help=(
+            "Method for auto_quantize sensitivity analysis. 'gradient' uses gradient-based method "
+            "(requires labels in dataset). 'kl_div' uses KL divergence between original and "
+            "quantized model outputs (no labels required). Default: 'gradient'"
+        ),
+    )
     parser.add_argument(
         "--calib_batch_size", type=int, help="Batch size for quantization calibration"
     )
@@ -139,6 +152,7 @@ def setup_parser_with_modelopt_args():
         {
             "quant_cfg": args.quant_cfg,
             "auto_quantize_bits": args.auto_quantize_bits,
+            "auto_quantize_method": args.auto_quantize_method,
             "calib_batch_size": args.calib_batch_size,
             "calib_size": args.calib_size,
             "compress": args.compress,
 
@@ -224,6 +224,7 @@ def main(
     ntrain: int = 5,
     quant_cfg: str | None = None,
     auto_quantize_bits: float | None = None,
+    auto_quantize_method: str = "gradient",
     batch_size: int = 0,
     calib_size: int = 512,
     dtype: str = "bfloat16",
@@ -281,6 +282,7 @@ def main(
                     batch_size=batch_size,
                     calib_size=calib_size,
                     auto_quantize_bits=auto_quantize_bits,
+                    auto_quantize_method=auto_quantize_method,
                 )
 
     for subject in tqdm(subjects):
 
@@ -66,6 +66,7 @@ def _quantize_model_with_dataset(
     quant_cfg: str | list[str],
     calib_dataset,
     auto_quantize_bits=None,
+    auto_quantize_method="gradient",
     batch_size=1,
     compress=False,
 ):
@@ -81,23 +82,41 @@ def _quantize_model_with_dataset(
             getattr(mtq, quant_fmt) for quant_fmt in quant_cfg if quant_fmt != "NONE"
         ]
 
-        def loss_func(output, data):
-            # For transformers AutoModelForCausalLM models, the outputs are wrapped in `CausalLMOutputWithPast`
-            # which contains the loss attribute.
-            return output.loss
+        # Configure forward_step and loss_func based on method
+        if auto_quantize_method == "gradient":
+            # For gradient-based method, return full output with loss
+            def forward_step(model, batch):
+                return model(**batch)
+
+            def loss_func(output, data):
+                # For transformers AutoModelForCausalLM models, the outputs are wrapped in `CausalLMOutputWithPast`
+                # which contains the loss attribute.
+                return output.loss
+        elif auto_quantize_method == "kl_div":
+            # For KL divergence method, return only logits
+            def forward_step(model, batch):
+                return model(**batch).logits
+
+            loss_func = None  # KL divergence doesn't need a custom loss function
+        else:
+            raise ValueError(
+                f"Invalid auto_quantize_method: {auto_quantize_method}. "
+                "Must be 'gradient' or 'kl_div'"
+            )
 
         net, _ = mtq.auto_quantize(
             net,
             constraints={"effective_bits": auto_quantize_bits},
             quantization_formats=quant_cfg_for_search,
             data_loader=calib_dataset,
-            forward_step=lambda model, batch: model(**batch),
+            forward_step=forward_step,
             loss_func=loss_func,
             num_calib_steps=len(calib_dataset),
             num_score_steps=min(
                 len(calib_dataset), 128 // batch_size
             ),  # Limit the number of score steps to avoid long calibration time
             verbose=True,
+            method=auto_quantize_method,
         )
     else:
         mtq_cfg = CUSTOM_CONFIG.get(quant_cfg)  # type: ignore [arg-type]
@@ -142,6 +161,7 @@ def quantize_model(
     batch_size,
     calib_size,
     auto_quantize_bits=None,
+    auto_quantize_method="gradient",
     data="cnn_dailymail",
     test_generated=True,
     compress=False,
@@ -156,6 +176,7 @@ def quantize_model(
         batch_size: the calibration batch size for each calibration inference run.
         calib_size: the total calibration dataset size.
         auto_quantize_bits: The effective bits constraint for auto_quantize.
+        auto_quantize_method: The method for auto_quantize ('gradient' or 'kl_div').
         data: the name of the calibration dataset.
         test_generated:  If ``True``, test the generated text before and after quantization.
         compress: If ``True``, compress the model after quantization.
@@ -180,21 +201,30 @@ def quantize_model(
         batch_size = get_max_batch_size(net)
         print(f"Update calib batch {batch_size}")
 
+    # Labels are only needed for gradient-based auto_quantize
+    include_labels = auto_quantize_bits is not None and auto_quantize_method == "gradient"
+
     calib_dataloader = get_dataset_dataloader(
         dataset_name=data,
         tokenizer=tokenizer,
         batch_size=batch_size,
         num_samples=calib_size,
         device=device,
-        include_labels=auto_quantize_bits is not None,
+        include_labels=include_labels,
     )
 
     if test_generated:
         input_str = tokenizer.decode(next(iter(calib_dataloader))["input_ids"][0])
         generated_str_before_ptq = model.run(input_str)
 
     _quantize_model_with_dataset(
-        model, quant_cfg, calib_dataloader, auto_quantize_bits, batch_size, compress
+        model,
+        quant_cfg,
+        calib_dataloader,
+        auto_quantize_bits,
+        auto_quantize_method,
+        batch_size,
+        compress,
     )
 
     if test_generated:
 
@@ -28,6 +28,7 @@
 import regex as re
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from tqdm import tqdm
 
 from modelopt.torch.opt.conversion import ModeloptStateManager
@@ -943,8 +944,175 @@ def run_search_with_stats(self, max_weight_size, verbose=False):
         return best_recipes, is_satisfied
 
 
-class AutoQuantizeLossSearcher(_AutoQuantizeBaseSearcher):
-    """A searcher for AutoQuantize algorithm that uses loss based score estimation."""
+@torch.compile
+def _get_kl_div_loss(logits_unquant: torch.Tensor, logits_quant: torch.Tensor) -> torch.Tensor:
+    """Return the summed KL divergence of the quantized logits w.r.t. the unquantized logits.
+
+    Both inputs are normalized over their last dimension. The unquantized probabilities are
+    the target distribution, the quantized log-probabilities are the input, and the
+    element-wise terms are reduced with ``reduction="sum"``.
+    """
+    # TODO: Support TensorParallel
+    quant_log_probs = F.log_softmax(logits_quant, dim=-1)
+    reference_probs = F.softmax(logits_unquant, dim=-1)
+    return F.kl_div(quant_log_probs, reference_probs, reduction="sum", log_target=False)
+
+
+class AutoQuantizeKLDivSearcher(_AutoQuantizeBaseSearcher):
+    """A searcher for AutoQuantize algorithm that uses KL-Divergence loss based score estimation.
+
+    Each quantization recipe is scored by the KL divergence between the logits obtained with
+    every configurable recipe hparam set to the unquantized recipe and the logits obtained
+    when only that one recipe is switched on. Recipes are then assigned by a threshold search
+    over those scores (see ``run_search_with_stats``).
+    """
+
+    score_module_rules: list[str | Callable] = [lambda name: ""]
+
+    @property
+    def default_search_config(self):
+        """Get the default config for the searcher, extended with a ``forward_step`` entry."""
+        config = super().default_search_config
+        config.update({"forward_step": None})
+        return config
+
+    def sanitize_search_config(self, config: SearchConfig | None) -> SearchConfig:
+        """Sanitize the search config dict.
+
+        ``score_func``, ``loss_func`` and ``forward_backward_step`` are not used by this
+        searcher; if present they are dropped with a warning before the parent class
+        sanitizes the config.
+
+        Args:
+            config: The user supplied search config, or ``None``.
+
+        Returns:
+            The sanitized config.
+
+        Raises:
+            AssertionError: If the sanitized config has no ``forward_step``.
+        """
+        # Work on a shallow copy so that dropping the ignored keys does not mutate the
+        # dict owned by the caller.
+        config = dict(config) if config else {}
+        for ignored_key in ("score_func", "loss_func", "forward_backward_step"):
+            if ignored_key in config:
+                warnings.warn(
+                    f"`{ignored_key}` is ignored for KL-Divergence loss based `auto_quantize`."
+                )
+                config.pop(ignored_key)
+        config = super().sanitize_search_config(config)
+        assert config["forward_step"] is not None, (
+            "`forward_step` must be provided for KL-Divergence loss based `auto_quantize`. "
+            "`forward_step(model, data)` should return model logits."
+        )
+        return config
+
+    @torch.no_grad()
+    def estimate_sensitivity_scores(self):
+        """Estimate the sensitivity scores for the model.
+
+        Higher score means more sensitive to quantization.
+
+        For up to ``num_score_steps`` batches of ``data_loader``, the logits of the
+        unquantized model are compared (via ``_get_kl_div_loss``) against the logits obtained
+        with a single recipe of a single hparam activated. The resulting scores are summed
+        over the batches into ``hparam._importance_dict[recipe][hparam.score_modules[0]]``.
+        """
+        # Check if tensor parallelism is being used; it is only warned about, not blocked.
+        for module in self.model.modules():
+            parallel_state = getattr(module, "parallel_state", None)
+            tp_group = getattr(parallel_state, "tensor_parallel_group", None)
+            if tp_group is not None and tp_group.is_initialized():
+                warnings.warn(
+                    "Tensor Parallel is not supported for KL-Divergence based auto_quantize. "
+                )
+                break
+
+        forward_step = self.config["forward_step"]
+
+        def set_to_unquantized():
+            for _, hparam in named_hparams(self.model, unique=True):
+                if isinstance(hparam, QuantRecipeHparam) and hparam.is_configurable:
+                    hparam.active = QuantRecipe(quant_cfg=None)
+
+        self.model.eval()
+        num_iters = self.config["num_score_steps"]
+        for _, data in tqdm(
+            zip(range(num_iters), self.config["data_loader"]),
+            desc="Estimating KLDivergence loss",
+            total=num_iters,
+        ):
+            set_to_unquantized()
+            logits_unquant = forward_step(self.model, data)
+
+            for _, hparam in named_hparams(self.model, configurable=True):
+                if not isinstance(hparam, QuantRecipeHparam):
+                    continue
+                for recipe in hparam.choices:
+                    if recipe == QuantRecipe(quant_cfg=None):
+                        continue
+                    hparam.active = recipe
+                    logits_quant = forward_step(self.model, data)
+                    score = _get_kl_div_loss(logits_unquant, logits_quant)
+                    # Accumulate over batches instead of overwriting, otherwise only the
+                    # last batch would contribute to the score.
+                    # NOTE(review): assumes entries start out missing, None or zero -- confirm
+                    # against the initialization of `QuantRecipeHparam._importance_dict`.
+                    recipe_scores = hparam._importance_dict[recipe]
+                    score_module = hparam.score_modules[0]
+                    previous = recipe_scores.get(score_module)
+                    recipe_scores[score_module] = score if previous is None else previous + score
+                # Restore this hparam so the next one is measured in isolation.
+                hparam.active = QuantRecipe(quant_cfg=None)
+
+    def _select_recipes_for_threshold(self, threshold):
+        """Pick, for every candidate, the first format whose score is ``<= threshold``.
+
+        When no score of a candidate qualifies, its first listed format is used.
+
+        Returns:
+            A tuple ``(recipes, total_weight_size)`` where ``recipes`` maps each candidate name
+            to a dict with the selected ``"format"``, ``"costs"`` and ``"scores"`` entries and
+            ``total_weight_size`` is the sum of the selected costs.
+        """
+        recipes = {}
+        total_weight_size = 0.0
+        for name, stats in self.candidate_stats.items():
+            formats, scores, costs = stats["formats"], stats["scores"], stats["costs"]
+
+            selected_idx = 0
+            for idx, (_, score) in enumerate(zip(formats, scores)):
+                if score <= threshold:
+                    selected_idx = idx
+                    break
+
+            recipes[name] = {
+                "format": formats[selected_idx],
+                "costs": costs[selected_idx],
+                "scores": scores[selected_idx],
+            }
+            total_weight_size += costs[selected_idx]
+        return recipes, total_weight_size
+
+    def run_search_with_stats(self, max_weight_size, verbose=False):
+        """Run threshold-based binary search for KLDivergence loss based auto_quantize.
+
+        We use binary search to minimize the max(per-layer score) while meeting the constraint.
+        The threshold is bisected between the smallest and largest collected score: it is
+        lowered whenever the selection meets the constraint and raised otherwise. The
+        selection for the lowest threshold that met the constraint is returned; if none did,
+        the last evaluated selection is returned with ``is_satisfied=False``.
+
+        Args:
+            max_weight_size: Upper limit for the summed ``"costs"`` of the selected formats.
+            verbose: If ``True``, print the progress of the search.
+
+        Returns:
+            A tuple ``(best_recipes, is_satisfied)``.
+        """
+        # Collect all sensitivity scores to determine initial threshold bounds
+        all_scores = [
+            score for stats in self.candidate_stats.values() for score in stats["scores"]
+        ]
+
+        if not all_scores:
+            warnings.warn("No scores available for threshold-based search!")
+            is_satisfied = False
+            return {}, is_satisfied
+
+        # Initialize binary search bounds
+        min_score = min(all_scores)
+        max_score = max(all_scores)
+        lower_bound, upper_bound = min_score, max_score
+        threshold = (lower_bound + upper_bound) / 2.0
+
+        # Upper limit on the number of bisection steps
+        max_iterations = 100
+
+        if verbose:
+            print_rank_0("AutoQuantize: Starting threshold-based binary search")
+            print_rank_0(f"  Score range: [{min_score:.6e}, {max_score:.6e}]")
+            print_rank_0(f"  Target weight size: {max_weight_size:.2f}")
+
+        # `feasible` holds the selection for the lowest threshold that met the constraint so
+        # far; `last_evaluated` is only used as a fallback when nothing was feasible.
+        feasible = None
+        last_evaluated = None
+        for iteration in range(max_iterations):
+            recipes, total_weight_size = self._select_recipes_for_threshold(threshold)
+            last_evaluated = (recipes, total_weight_size)
+
+            # Check if we meet the constraint
+            meets_constraint = total_weight_size <= max_weight_size
+
+            if verbose:
+                print_rank_0(
+                    f"  Iteration {iteration + 1}: threshold={threshold:.6e}, "
+                    f"weight_size={total_weight_size:.2f}, "
+                    f"meets_constraint={meets_constraint}"
+                )
+
+            # Update binary search bounds
+            if meets_constraint:
+                # Later feasible thresholds are never above this one, so simply overwriting
+                # keeps the tightest feasible selection.
+                feasible = (recipes, total_weight_size)
+                upper_bound = threshold  # Constraint met, try a lower threshold
+            else:
+                lower_bound = threshold  # Constraint violated, raise the threshold
+
+            next_threshold = (lower_bound + upper_bound) / 2.0
+            if next_threshold == threshold:
+                # The interval cannot shrink any further in floating point; every remaining
+                # iteration would re-evaluate exactly the same threshold.
+                break
+            threshold = next_threshold
+
+        if feasible is None:
+            # The midpoint may never land exactly on `max_score`, so probe the most
+            # permissive threshold explicitly before giving up. Like the bisection itself,
+            # this relies on higher thresholds not increasing the total weight size.
+            recipes, total_weight_size = self._select_recipes_for_threshold(max_score)
+            if total_weight_size <= max_weight_size:
+                feasible = (recipes, total_weight_size)
+
+        best_recipes, total_weight_size = feasible if feasible is not None else last_evaluated
+
+        # Final check if constraint is satisfied
+        is_satisfied = total_weight_size <= max_weight_size
+
+        if verbose:
+            print_rank_0(
+                f"AutoQuantize: Search complete. "
+                f"Final weight size: {total_weight_size:.2f} "
+                f"(target: {max_weight_size:.2f}), "
+                f"constraint satisfied: {is_satisfied}"
+            )
+
+        return best_recipes, is_satisfied
 
 
 # Backward compatibility alias (defaults to gradient-based searcher)