From b76843a89213d69e5fa58272ac3dcee775dda81e Mon Sep 17 00:00:00 2001 From: jainapurva Date: Sun, 27 Jul 2025 18:20:20 -0700 Subject: [PATCH 01/11] Remove double baseline calculations --- .../microbenchmarks/benchmark_inference.py | 180 +----------------- .../microbenchmarks/benchmark_runner.py | 3 - benchmarks/microbenchmarks/utils.py | 40 ++-- torchao/testing/model_architectures.py | 1 + 4 files changed, 31 insertions(+), 193 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index 77ae7080ef..ecddc88b5f 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -1,181 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. - -# This source code is licensed under the license found in the +# +# This source code is licensed under the BSD 3-Clause license found in the # LICENSE file in the root directory of this source tree. -""" -Inference benchmark runner - -This script runs inference benchmarks and generates a micro-benchmarking report for it. -- run() function is the main entry point for running inference benchmarks. -""" - -import os -from copy import deepcopy -from pathlib import Path - -import torch - -from benchmarks.microbenchmarks.profiler import ( - generate_memory_profile, - generate_model_profile, - visualize_memory_profile, -) -from benchmarks.microbenchmarks.utils import ( - BenchmarkConfig, - BenchmarkResult, - clean_caches, - model_inference_time_in_ms, - string_to_config, -) -from torchao.quantization import quantize_ -from torchao.sparsity.sparse_api import sparsify_ -from torchao.testing.model_architectures import ( - create_model_and_input_data, -) - - -def run(config: BenchmarkConfig) -> BenchmarkResult: - """Run inference benchmarks""" - try: - clean_caches() # Clean caches - - # Create output directory if it doesn't exist - Path(config.output_dir).mkdir(parents=True, exist_ok=True) - - base_model, input_data = create_model_and_input_data( - config.model_type, - config.m, - config.k, - config.n, - high_precision_dtype=config.high_precision_dtype, - device=config.device, - ) - # Copy base model for quantizing - m_copy = deepcopy(base_model) - - # Run benchmarks - result = BenchmarkResult(config=config) - - # Store result in model for memory profiling - base_model._benchmark_result = result - - # Run baseline benchmarking - base_model = base_model.eval().to(config.device) - if config.use_torch_compile: - print("Compiling baseline model....") - base_model = torch.compile( - base_model, mode=config.torch_compile_mode, fullgraph=True - ) - # Benchmark time to run an inference call for baseline model - print("Benchmarking baseline inference.....") - result.baseline_inference_time_in_ms = model_inference_time_in_ms( - model=base_model, input_data=input_data - ) - - ao_base_config = string_to_config( - config.quantization, - config.sparsity, - high_precision_dtype=config.high_precision_dtype, - ) - - # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA) - is_cuda = config.device == "cuda" and torch.cuda.is_available() - - if config.sparsity is not None and ( - config.quantization is None or "baseline" in config.quantization - ): - if is_cuda: - print(f"Applying {config.sparsity} sparsity to model") - sparsify_(m_copy, ao_base_config) - else: - print( - f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}" - ) - elif 
config.sparsity is None and ( - config.quantization is None or "baseline" in config.quantization - ): - pass # No quantization or sparsity specified, do nothing - else: - print("Quantizing model....") - m_copy = m_copy.eval().to(config.device) - quantize_(m_copy, ao_base_config) - - if config.use_torch_compile: - print("Compiling quantized model....") - m_copy = torch.compile( - m_copy, mode=config.torch_compile_mode, fullgraph=True - ) - - # Store result in model for memory profiling - m_copy._benchmark_result = result - - # Benchmark time to run an inference call for quantized model - print("Benchmarking quantized model.....") - result.model_inference_time_in_ms = model_inference_time_in_ms( - model=m_copy, input_data=input_data - ) - - # Calculate speedup w.r.t. baseline - result.speedup = round( - result.baseline_inference_time_in_ms / result.model_inference_time_in_ms, 2 - ) - - # Run profiler if enabled - if config.enable_profiler: - print("Running profiler...") - try: - profiler_json_path = generate_model_profile( - model=m_copy, - input_data=input_data, - profile_file_path=os.path.join( - config.output_dir, - "profiler", - f"{config._file_name}_profile.json", - ), - ) - result.profiler_json_path = profiler_json_path - except Exception as e: - print(f"Error running profiler: {e}") - - # Run memory profiler if enabled - if config.enable_memory_profiler: - print("Running memory profiler...") - try: - # Create memory profiler directory if it doesn't exist - memory_profiler_dir = os.path.join( - config.output_dir, "memory_profiler/pickle" - ) - os.makedirs(memory_profiler_dir, exist_ok=True) - - # Save memory profile with .pickle extension - result.memory_profile_path, result.memory_stats = ( - generate_memory_profile( - model=m_copy, - input_data=input_data, - profile_file_path=os.path.join( - memory_profiler_dir, - f"{config._file_name}_memory_profile.pickle", - ), - ) - ) - - if result.memory_profile_path: - result.memory_visualization_path = visualize_memory_profile( - result.memory_profile_path - ) - except ValueError as e: - if "not enough values to unpack" in e: - print( - "Failed due to existing bugs, re-run the code to generate memory profile. Please raise an issue if it persists." 
- ) - except Exception as e: - print(f"Error running memory profiler: {e}") - import traceback - - traceback.print_exc() - - return result - except Exception as e: - print(f"Error in benchmark run: {config.name} with error: {e}") - return None diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 8066b71714..45a0534ee0 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -139,9 +139,6 @@ def get_quantization_sparsity_recipes( """ config_recipes = set() - # Always include baseline without sparsity - config_recipes.add(("baseline", None)) - # Add all quantization techniques without sparsity for quant_config in quantization_recipes: config_recipes.add((quant_config, None)) diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 40bce5c33d..d7229c8a14 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -125,9 +125,13 @@ def __init__( ): self.config = config self.output_dir = config.output_dir - self.baseline_inference_time_in_ms = 0.0 - self.model_inference_time_in_ms = 0.0 - self.speedup = 0.0 + self.eager_baseline_inference_time_in_ms = 0.0 + self.eager_model_inference_time_in_ms = 0.0 + self.compile_baseline_inference_time_in_ms = 0.0 + self.compile_model_inference_time_in_ms = 0.0 + self.eager_speedup_on_baseline = 0.0 + self.compile_speedup_on_baseline = 0.0 + self.compile_speedup_on_eager = 0.0 self.profiler_json_path: Optional[str] = None self.memory_profile_path: Optional[str] = None self.memory_visualization_path: Optional[str] = None @@ -137,9 +141,13 @@ def to_dict(self) -> Dict[str, Any]: """Convert result to dictionary for main function""" result_dict = { **self.config.to_dict(), - "baseline_inference_time_in_ms": self.baseline_inference_time_in_ms, - "model_inference_time_in_ms": self.model_inference_time_in_ms, - "speedup": self.speedup, + "eager_baseline_inference_time_in_ms": self.eager_baseline_inference_time_in_ms, + "eager_model_inference_time_in_ms": self.eager_model_inference_time_in_ms, + "compile_baseline_inference_time_in_ms": self.compile_baseline_inference_time_in_ms, + "compile_model_inference_time_in_ms": self.compile_model_inference_time_in_ms, + "eager speedup on baseline": self.eager_speedup_on_baseline, + "compile speedup on baseline": self.compile_speedup_on_baseline, + "eager vs compile speedup": self.compile_speedup_on_eager, "profiler_json_path": self.profiler_json_path, "memory_profile_path": self.memory_profile_path, "memory_visualization_path": self.memory_visualization_path, @@ -408,9 +416,13 @@ def print_results(results: List[BenchmarkResult]): result.config.quantization or "baseline", result.config.sparsity or "none", f"{result.config.shape_name} ({result.config.m}, {result.config.k}, {result.config.n})", - f"{result.baseline_inference_time_in_ms:.2f}", - f"{result.model_inference_time_in_ms:.2f}", - f"{result.speedup:.2f}x", + f"{result.eager_baseline_inference_time_in_ms:.2f}", + f"{result.eager_model_inference_time_in_ms:.2f}", + f"{result.eager_speedup_on_baseline:.2f}x", + f"{result.compile_baseline_inference_time_in_ms:.2f}", + f"{result.compile_model_inference_time_in_ms:.2f}", + f"{result.compile_speedup_on_baseline:.2f}x", + f"{result.compile_speedup_on_eager:.2f}x", str(result.config.enable_profiler), ] @@ -422,9 +434,13 @@ def print_results(results: List[BenchmarkResult]): "Quantization", "Sparsity", "Shape", - "Baseline Inference Time 
(ms)", - "Inference Time (ms)", - "Speedup", + "Eager Baseline Inference Time (ms)", + "Eager Model Inference Time (ms)", + "Eager Speedup", + "Compile Baseline Inference Time (ms)", + "Compile Model Inference Time (ms)", + "Compile Speedup", + "Eager vs Compile Speedup", "Profiler Enabled", ] diff --git a/torchao/testing/model_architectures.py b/torchao/testing/model_architectures.py index f59a1271b1..9f02bbebc5 100644 --- a/torchao/testing/model_architectures.py +++ b/torchao/testing/model_architectures.py @@ -156,6 +156,7 @@ def create_model_and_input_data( high_precision_dtype (torch.dtype): data type of the model m, k, n (int): dimensions of the model and input data """ + torch.manual_seed(42) if model_type == "linear": model = ToyLinearModel(k, n, high_precision_dtype).to(device) input_data = torch.randn(m, k, device=device, dtype=high_precision_dtype) From 4da5639aec20e33334cc0d0e348710f7ff97c14a Mon Sep 17 00:00:00 2001 From: jainapurva Date: Sun, 27 Jul 2025 18:29:19 -0700 Subject: [PATCH 02/11] Calculate both compile and eager by default --- .../dashboard/microbenchmark_quantization_config.yml | 1 - benchmarks/microbenchmarks/test/benchmark_config.yml | 4 ---- .../microbenchmarks/test/test_benchmark_inference.py | 3 --- .../microbenchmarks/test/test_benchmark_runner.py | 2 -- benchmarks/microbenchmarks/test/test_utils.py | 2 -- benchmarks/microbenchmarks/utils.py | 10 ++-------- docs/source/benchmarking_api_guide.md | 6 ++---- 7 files changed, 4 insertions(+), 24 deletions(-) diff --git a/benchmarks/dashboard/microbenchmark_quantization_config.yml b/benchmarks/dashboard/microbenchmark_quantization_config.yml index 774237d54c..8156422668 100644 --- a/benchmarks/dashboard/microbenchmark_quantization_config.yml +++ b/benchmarks/dashboard/microbenchmark_quantization_config.yml @@ -14,7 +14,6 @@ model_params: min_power: 10 max_power: 15 high_precision_dtype: "torch.bfloat16" - use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 4fd5eb2018..40db49e223 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -13,7 +13,6 @@ model_params: min_power: 14 max_power: 16 high_precision_dtype: "torch.bfloat16" - use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" @@ -27,7 +26,6 @@ model_params: [2048, 4096, 1024], ] high_precision_dtype: "torch.bfloat16" - use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "ln_linear_sigmoid" @@ -41,7 +39,6 @@ model_params: [2048, 4096, 1024], # For transformer_block, k is the hidden dimension ] high_precision_dtype: "torch.bfloat16" - use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "transformer_block" # TODO: Add a custom model (Figure out how to do this, maybe pass a .py file with model definition) @@ -58,7 +55,6 @@ model_params: min_power: 10 # 1024 max_power: 11 # 2048 high_precision_dtype: "torch.bfloat16" - use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" diff --git a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index 22863dcbcf..e0f55a6aca 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ 
-21,7 +21,6 @@ def setUp(self): sparsity="semi-sparse", params={ "high_precision_dtype": "torch.float32", - "use_torch_compile": False, "device": "cpu", "model_type": "linear", }, @@ -64,7 +63,6 @@ def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config): sparsity="semi-sparse", params={ "high_precision_dtype": "torch.float32", - "use_torch_compile": False, "device": "cpu", "model_type": "linear", }, @@ -92,7 +90,6 @@ def test_run_inference_with_block_sparsity(self, mock_string_to_config): sparsity="block", params={ "high_precision_dtype": "torch.float32", - "use_torch_compile": False, "device": "cpu", "model_type": "linear", }, diff --git a/benchmarks/microbenchmarks/test/test_benchmark_runner.py b/benchmarks/microbenchmarks/test/test_benchmark_runner.py index 2f7e5ba541..f7e54e4bec 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_runner.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_runner.py @@ -39,7 +39,6 @@ def setUp(self): } ], "high_precision_dtype": "torch.bfloat16", - "use_torch_compile": True, "torch_compile_mode": "max-autotune", "device": "cpu", "model_type": "linear", @@ -130,7 +129,6 @@ def test_get_param_combinations(self): self.assertEqual(len(shapes), 1) self.assertEqual(shapes[0], ("custom", [1024, 1024, 1024])) self.assertEqual(params["high_precision_dtype"], "torch.bfloat16") - self.assertEqual(params["use_torch_compile"], True) @patch("argparse.Namespace") def test_load_benchmark_configs(self, mock_args): diff --git a/benchmarks/microbenchmarks/test/test_utils.py b/benchmarks/microbenchmarks/test/test_utils.py index 06f557a8f4..5d21260bf9 100644 --- a/benchmarks/microbenchmarks/test/test_utils.py +++ b/benchmarks/microbenchmarks/test/test_utils.py @@ -33,7 +33,6 @@ def setUp(self): self.test_params = { "name": "test_model", "high_precision_dtype": "torch.bfloat16", - "use_torch_compile": True, "torch_compile_mode": "max-autotune", "device": "cpu", "model_type": "linear", @@ -57,7 +56,6 @@ def test_benchmark_config(self): self.assertEqual(config.k, 1024) self.assertEqual(config.n, 1024) self.assertEqual(config.high_precision_dtype, torch.bfloat16) - self.assertEqual(config.use_torch_compile, True) self.assertEqual(config.torch_compile_mode, "max-autotune") self.assertEqual(config.device, "cpu") self.assertEqual(config.model_type, "linear") diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index d7229c8a14..94c6f19b81 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -73,18 +73,13 @@ def __init__( self.high_precision_dtype = self._parse_precision( params.get("high_precision_dtype", "torch.bfloat16") ) - self.use_torch_compile = bool(params.get("use_torch_compile", False)) - self.torch_compile_mode = ( - params.get("torch_compile_mode", "default") - if self.use_torch_compile - else None - ) + self.torch_compile_mode = params.get("torch_compile_mode", "default") self.device = get_default_device(params.get("device", None)) self.model_type = params.get("model_type", "linear") self.output_dir = f"{output_dir}/{self.benchmark_mode}" self.name = params.get( "name", - f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile' if self.use_torch_compile else ''}", + f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile'}", ) self.enable_profiler = bool(params.get("enable_profiler", False)) self.enable_memory_profiler = bool(params.get("enable_memory_profiler", False)) @@ -108,7 +103,6 @@ def 
to_dict(self) -> Dict[str, Any]: "k": self.k, "n": self.n, "high_precision_dtype": self.high_precision_dtype, - "use_torch_compile": self.use_torch_compile, "torch_compile_mode": self.torch_compile_mode, "device": self.device, "model_type": self.model_type, diff --git a/docs/source/benchmarking_api_guide.md b/docs/source/benchmarking_api_guide.md index b07a0e14ff..bd81a7f65f 100644 --- a/docs/source/benchmarking_api_guide.md +++ b/docs/source/benchmarking_api_guide.md @@ -122,7 +122,6 @@ model_params: min_power: 10 max_power: 15 high_precision_dtype: "torch.bfloat16" - use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" @@ -199,9 +198,8 @@ python -m unittest discover benchmarks/microbenchmarks/test ### Common Issues 1. **CUDA Out of Memory**: Reduce batch size or matrix dimensions -2. **Compilation Errors**: Set `use_torch_compile: false` for debugging -3. **Missing Quantization Methods**: Ensure TorchAO is properly installed -4. **Device Not Available**: Check device availability and drivers +2. **Missing Quantization Methods**: Ensure TorchAO is properly installed +3. **Device Not Available**: Check device availability and drivers ### Best Practices From 804bea7719e977b40309f41d4897de926bb9f2de Mon Sep 17 00:00:00 2001 From: jainapurva Date: Sun, 27 Jul 2025 18:43:02 -0700 Subject: [PATCH 03/11] Updates --- .../microbenchmarks/benchmark_inference.py | 270 +++++++++++++++++- 1 file changed, 268 insertions(+), 2 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index ecddc88b5f..47aff4d7be 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -1,5 +1,271 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# -# This source code is licensed under the BSD 3-Clause license found in the + +# This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. +""" +Inference benchmark runner + +This script runs inference benchmarks and generates a micro-benchmarking report for it. +- run() function is the main entry point for running inference benchmarks. +""" + +import os +from copy import deepcopy +from pathlib import Path +from typing import Dict, Tuple + +import torch + +from benchmarks.microbenchmarks.profiler import ( + generate_memory_profile, + generate_model_profile, + visualize_memory_profile, +) +from benchmarks.microbenchmarks.utils import ( + BenchmarkConfig, + BenchmarkResult, + clean_caches, + model_inference_time_in_ms, + string_to_config, +) +from torchao.quantization import quantize_ +from torchao.sparsity.sparse_api import sparsify_ +from torchao.testing.model_architectures import ( + create_model_and_input_data, +) + +# ----------------------------------------------------------------------------- +# Baseline caching +# +# ``_BASELINE_CACHE`` maps a unique key to a tuple +# ``(eager_baseline_time, compile_baseline_time)``. See ``_make_cache_key`` for the key +# construction. Users should not access this cache directly; it is +# internal to this module. The cache intentionally holds the +# uncompiled base model so that quantized versions can be derived +# without mutating the cached copy. + +_BASELINE_CACHE: Dict[Tuple, Tuple[float, float]] = {} + + +def _make_cache_key(config: BenchmarkConfig) -> Tuple: + """Create a key for caching based on benchmark configuration. 
+ + Parameters that affect baseline performance are included: + + * model type (e.g. ``linear`` or ``transformer_block``) + * shape dimensions (m, k, n) + * high precision dtype (bf16, fp16, etc.) + * device (cuda, cpu, mps) + * compile settings (whether compile is enabled and compile mode) + + Sparsity and quantization settings are deliberately excluded + because the baseline (non‑quantized, non‑sparse) performance is + independent of those attributes. + """ + return ( + config.model_type, + config.m, + config.k, + config.n, + config.high_precision_dtype, + config.device, + config.torch_compile_mode, + ) + + +def run(config: BenchmarkConfig) -> BenchmarkResult: + """ + Run inference benchmarks. + + The function first checks if a baseline for the given configuration + already exists in the internal cache. If not, it measures the baseline + inference time and stores the result. When the baseline is cached, + the function reuses the stored model and input data to + benchmark quantized variants, avoiding redundant baseline measurements. + + Args: + config (BenchmarkConfig): Benchmark configuration. + + Returns: + BenchmarkResult: Result of the benchmark. + """ + try: + clean_caches() # Clean caches + + # Create output directory if it doesn't exist + Path(config.output_dir).mkdir(parents=True, exist_ok=True) + + # Prepare result container + result = BenchmarkResult(config=config) + + # Create model and input data + base_model, input_data = create_model_and_input_data( + config.model_type, + config.m, + config.k, + config.n, + high_precision_dtype=config.high_precision_dtype, + device=config.device, + ) + + # Generate a cache key for the current configuration + cache_key = _make_cache_key(config) + + # Check if the baseline for this configuration has been computed + if cache_key not in _BASELINE_CACHE: + # Switch model to eval and move to device + base_model = base_model.eval().to(config.device) + print("Benchmarking eager baseline inference.....") + eager_baseline_time = model_inference_time_in_ms( + model=base_model, input_data=input_data + ) + + print("Benchmarking compile baseline inference.....") + base_model = torch.compile( + base_model, mode=config.torch_compile_mode, fullgraph=True + ) + compile_baseline_time = model_inference_time_in_ms( + model=base_model, input_data=input_data + ) + + # Store uncompiled model, input and baseline time + _BASELINE_CACHE[cache_key] = (eager_baseline_time, compile_baseline_time) + + result.eager_baseline_inference_time_in_ms = eager_baseline_time + result.compile_baseline_inference_time_in_ms = compile_baseline_time + else: + # Retrieve cached values + cached_eager_time, cached_compile_time = _BASELINE_CACHE[cache_key] + result.eager_baseline_inference_time_in_ms = cached_eager_time + result.compile_baseline_inference_time_in_ms = cached_compile_time + + # At this point, ``base_model`` is an uncompiled model ready for quantization, + # and ``input_data`` is the corresponding input tensor. The baseline time + # has been stored in ``result.baseline_inference_time_in_ms``. 
+ + # Copy base model for quantizing/sparsifying + m_copy = deepcopy(base_model) + + # Determine quantization/sparsity configuration + ao_base_config = string_to_config( + config.quantization, + config.sparsity, + high_precision_dtype=config.high_precision_dtype, + ) + + # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA) + is_cuda = config.device == "cuda" and torch.cuda.is_available() + + if config.sparsity is not None and ( + config.quantization is None or "baseline" in config.quantization + ): + if is_cuda: + print(f"Applying {config.sparsity} sparsity to model") + sparsify_(m_copy, ao_base_config) + else: + print( + f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}" + ) + elif config.sparsity is None and ( + config.quantization is None or "baseline" in config.quantization + ): + pass # No quantization or sparsity specified, do nothing + else: + print("Quantizing model....") + m_copy = m_copy.eval().to(config.device) + quantize_(m_copy, ao_base_config) + + # Store result in model for memory profiling + m_copy._benchmark_result = result + + # Measure inference time for quantized model + print("Benchmarking eager quantized model.....") + result.eager_model_inference_time_in_ms = model_inference_time_in_ms( + model=m_copy, input_data=input_data + ) + + # Measure inference time for compiled quantized model + print("Benchmarking quantized model.....") + m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True) + result.compile_model_inference_time_in_ms = model_inference_time_in_ms( + model=m_copy, input_data=input_data + ) + + # Compute eager speedup relative to baseline + result.eager_speedup_on_baseline = round( + result.eager_baseline_inference_time_in_ms + / result.eager_model_inference_time_in_ms, + 2, + ) + # Compute compile speedup relative to baseline + result.compile_speedup_on_baseline = round( + result.compile_baseline_inference_time_in_ms + / result.compile_model_inference_time_in_ms, + 2, + ) + # Compute compile speedup for quantized model relative to eager quantized model + result.compile_speedup_on_eager = round( + result.eager_model_inference_time_in_ms + / result.compile_model_inference_time_in_ms, + 2, + ) + + # Run profiler if enabled + if config.enable_profiler: + print("Running profiler...") + try: + profiler_json_path = generate_model_profile( + model=m_copy, + input_data=input_data, + profile_file_path=os.path.join( + config.output_dir, + "profiler", + f"{config._file_name}_profile.json", + ), + ) + result.profiler_json_path = profiler_json_path + except Exception as e: + print(f"Error running profiler: {e}") + + # Run memory profiler if enabled + if config.enable_memory_profiler: + print("Running memory profiler...") + try: + # Create memory profiler directory if it doesn't exist + memory_profiler_dir = os.path.join( + config.output_dir, "memory_profiler/pickle" + ) + os.makedirs(memory_profiler_dir, exist_ok=True) + + # Save memory profile with .pickle extension + result.memory_profile_path, result.memory_stats = ( + generate_memory_profile( + model=m_copy, + input_data=input_data, + profile_file_path=os.path.join( + memory_profiler_dir, + f"{config._file_name}_memory_profile.pickle", + ), + ) + ) + + if result.memory_profile_path: + result.memory_visualization_path = visualize_memory_profile( + result.memory_profile_path + ) + except ValueError as e: + if "not enough values to unpack" in str(e): + print( + "Failed due to existing bugs, re‑run the code to 
generate memory profile. Please raise an issue if it persists." + ) + except Exception as e: + print(f"Error running memory profiler: {e}") + import traceback + + traceback.print_exc() + + return result + except Exception as e: + print(f"Error in benchmark run: {config.name} with error: {e}") + return None From a193e4cf90a2e8a60a85dbb67ee9987977cad389 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Sun, 27 Jul 2025 18:45:48 -0700 Subject: [PATCH 04/11] Updates --- benchmarks/microbenchmarks/benchmark_inference.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index 47aff4d7be..b0d617f32d 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -81,8 +81,7 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: The function first checks if a baseline for the given configuration already exists in the internal cache. If not, it measures the baseline inference time and stores the result. When the baseline is cached, - the function reuses the stored model and input data to - benchmark quantized variants, avoiding redundant baseline measurements. + the function reuses the cached baselines to calculate speedup metrics. Args: config (BenchmarkConfig): Benchmark configuration. From d0b318fea309312c6e7a50824ec7cc728ca4e8c7 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Sun, 27 Jul 2025 23:37:37 -0700 Subject: [PATCH 05/11] Update CI run --- .../dashboard/ci_microbenchmark_runner.py | 72 ++++++++++++++++--- 1 file changed, 63 insertions(+), 9 deletions(-) diff --git a/benchmarks/dashboard/ci_microbenchmark_runner.py b/benchmarks/dashboard/ci_microbenchmark_runner.py index a8b7ae048d..3fda108f20 100644 --- a/benchmarks/dashboard/ci_microbenchmark_runner.py +++ b/benchmarks/dashboard/ci_microbenchmark_runner.py @@ -121,21 +121,23 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: if result is not None: # Create benchmark result in OSS format - speedup_result = create_benchmark_result( + + ## Compile mode results + compile_speedup_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Fwd Speedup (x)", - metric_values=[result.speedup], + metric_values=[result.compile_speedup_on_baseline], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, ) - results.append(speedup_result) - baseline_time_result = create_benchmark_result( + results.append(compile_speedup_result) + compile_baseline_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Bfloat16 Fwd Time (ms)", - metric_values=[result.baseline_inference_time_in_ms], + metric_values=[result.compile_baseline_inference_time_in_ms], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, @@ -143,12 +145,37 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: "unit": "ms", }, ) - results.append(baseline_time_result) - quantize_time_result = create_benchmark_result( + results.append(compile_baseline_time_result) + compile_quantize_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Quantized Fwd Time (ms)", - metric_values=[result.model_inference_time_in_ms], + 
metric_values=[result.compile_model_inference_time_in_ms], + quant_type=config.quantization, + device=config.device, + torch_compile_mode=config.torch_compile_mode, + metric_extra_info={ + "unit": "ms", + }, + ) + results.append(compile_quantize_time_result) + + ## Eager mode results + eager_speedup_result = create_benchmark_result( + benchmark_name="TorchAO Quantization Benchmark", + shape=[config.m, config.k, config.n], + metric_name="Fwd Speedup (x)", + metric_values=[result.eager_speedup_on_baseline], + quant_type=config.quantization, + device=config.device, + torch_compile_mode=config.torch_compile_mode, + ) + results.append(eager_speedup_result) + eager_baseline_time_result = create_benchmark_result( + benchmark_name="TorchAO Quantization Benchmark", + shape=[config.m, config.k, config.n], + metric_name="Bfloat16 Fwd Time (ms)", + metric_values=[result.eager_baseline_inference_time_in_ms], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, @@ -156,7 +183,34 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: "unit": "ms", }, ) - results.append(quantize_time_result) + results.append(eager_baseline_time_result) + eager_quantize_time_result = create_benchmark_result( + benchmark_name="TorchAO Quantization Benchmark", + shape=[config.m, config.k, config.n], + metric_name="Quantized Fwd Time (ms)", + metric_values=[result.eager_model_inference_time_in_ms], + quant_type=config.quantization, + device=config.device, + torch_compile_mode=config.torch_compile_mode, + metric_extra_info={ + "unit": "ms", + }, + ) + results.append(eager_quantize_time_result) + + ## Compile vs eager results + compile_eager_speedup_result = create_benchmark_result( + benchmark_name="TorchAO Quantization Benchmark", + shape=[config.m, config.k, config.n], + metric_name="Eager vs Compile Fwd Speedup (x)", + metric_values=[result.compile_speedup_on_eager], + quant_type=config.quantization, + device=config.device, + torch_compile_mode=config.torch_compile_mode, + ) + results.append(compile_eager_speedup_result) + + ## Memory results allocated_memory_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], From 63cd52495800895195e260a9ff3e6af114ba2750 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Mon, 28 Jul 2025 09:42:45 -0700 Subject: [PATCH 06/11] Update column names --- benchmarks/dashboard/ci_microbenchmark_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/dashboard/ci_microbenchmark_runner.py b/benchmarks/dashboard/ci_microbenchmark_runner.py index 3fda108f20..dc9c1b4963 100644 --- a/benchmarks/dashboard/ci_microbenchmark_runner.py +++ b/benchmarks/dashboard/ci_microbenchmark_runner.py @@ -164,7 +164,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: eager_speedup_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], - metric_name="Fwd Speedup (x)", + metric_name="Fwd Speedup w/ Eager (x)", metric_values=[result.eager_speedup_on_baseline], quant_type=config.quantization, device=config.device, @@ -174,7 +174,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: eager_baseline_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], - metric_name="Bfloat16 Fwd Time (ms)", + metric_name="Bfloat16 Fwd Time w/ Eager (ms)", 
metric_values=[result.eager_baseline_inference_time_in_ms], quant_type=config.quantization, device=config.device, @@ -187,7 +187,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: eager_quantize_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], - metric_name="Quantized Fwd Time (ms)", + metric_name="Quantized Fwd Time w/ Eager (ms)", metric_values=[result.eager_model_inference_time_in_ms], quant_type=config.quantization, device=config.device, From 5a35513cbb92d7e3ba888a4f2805cd73a9e5048a Mon Sep 17 00:00:00 2001 From: jainapurva Date: Mon, 28 Jul 2025 12:21:40 -0700 Subject: [PATCH 07/11] update comments --- benchmarks/dashboard/ci_microbenchmark_runner.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/benchmarks/dashboard/ci_microbenchmark_runner.py b/benchmarks/dashboard/ci_microbenchmark_runner.py index dc9c1b4963..d492712d85 100644 --- a/benchmarks/dashboard/ci_microbenchmark_runner.py +++ b/benchmarks/dashboard/ci_microbenchmark_runner.py @@ -120,9 +120,9 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: result = run_inference(config) if result is not None: - # Create benchmark result in OSS format + ## Create benchmark result in OSS format - ## Compile mode results + # Compile mode speedup compile_speedup_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], @@ -133,6 +133,8 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: torch_compile_mode=config.torch_compile_mode, ) results.append(compile_speedup_result) + + # Compile mode baseline compile_baseline_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], @@ -146,6 +148,8 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: }, ) results.append(compile_baseline_time_result) + + # Compile mode quantized compile_quantize_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], @@ -160,7 +164,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: ) results.append(compile_quantize_time_result) - ## Eager mode results + # Eager mode speedup eager_speedup_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], @@ -171,6 +175,8 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: torch_compile_mode=config.torch_compile_mode, ) results.append(eager_speedup_result) + + # Eager mode baseline eager_baseline_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], @@ -184,6 +190,8 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: }, ) results.append(eager_baseline_time_result) + + # Eager mode quantized eager_quantize_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], From b089e30c9f3730f15c8c766019bcd53d465c950a Mon Sep 17 00:00:00 2001 From: jainapurva Date: Mon, 28 Jul 2025 12:29:07 -0700 Subject: [PATCH 08/11] test updates --- .../microbenchmarks/test/test_benchmark_inference.py | 6 +++--- .../microbenchmarks/test/test_benchmark_profiler.py | 11 +++++------ benchmarks/microbenchmarks/test/test_utils.py | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git 
a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index e0f55a6aca..38ffcc5a6c 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ -45,7 +45,7 @@ def test_run_inference(self, mock_string_to_config): result = run(self.config) self.assertIsInstance(result, BenchmarkResult) - self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + self.assertTrue(hasattr(result, "compile_model_inference_time_in_ms")) @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config): @@ -73,7 +73,7 @@ def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config): ) result = run(config) self.assertIsInstance(result, BenchmarkResult) - self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + self.assertTrue(hasattr(result, "compile_model_inference_time_in_ms")) @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") def test_run_inference_with_block_sparsity(self, mock_string_to_config): @@ -100,7 +100,7 @@ def test_run_inference_with_block_sparsity(self, mock_string_to_config): ) result = run(config) self.assertIsInstance(result, BenchmarkResult) - self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + self.assertTrue(hasattr(result, "compile_model_inference_time_in_ms")) if __name__ == "__main__": diff --git a/benchmarks/microbenchmarks/test/test_benchmark_profiler.py b/benchmarks/microbenchmarks/test/test_benchmark_profiler.py index 92689c4802..d0c36d8cfe 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_profiler.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_profiler.py @@ -270,13 +270,12 @@ def test_memory_profiler_cuda_unavailable(self): f"{config.name}_{self.m}_{self.k}_{self.n}_memory_profile.json", ) - # Generate memory profile - result, memory_stats = generate_memory_profile( - self.model, self.input_data, memory_profile_path - ) - # Should return None when CUDA is unavailable - self.assertIsNone(result) + self.assertIsNone( + generate_memory_profile( + self.model, self.input_data, memory_profile_path + ) + ) # Should not create file when CUDA is unavailable self.assertFalse(os.path.exists(memory_profile_path)) diff --git a/benchmarks/microbenchmarks/test/test_utils.py b/benchmarks/microbenchmarks/test/test_utils.py index 5d21260bf9..864c521251 100644 --- a/benchmarks/microbenchmarks/test/test_utils.py +++ b/benchmarks/microbenchmarks/test/test_utils.py @@ -74,7 +74,7 @@ def test_benchmark_result(self): result = BenchmarkResult(config=config) self.assertEqual(result.config, config) - self.assertEqual(result.model_inference_time_in_ms, 0.0) + self.assertEqual(result.compile_model_inference_time_in_ms, 0.0) def test_get_default_device(self): # Test CPU fallback From 28f3f6a4a83160863746bab908e091f494b91314 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Fri, 1 Aug 2025 14:30:35 -0700 Subject: [PATCH 09/11] remove dashboard updates' --- .../dashboard/ci_microbenchmark_runner.py | 82 +++---------------- .../microbenchmark_quantization_config.yml | 1 + 2 files changed, 11 insertions(+), 72 deletions(-) diff --git a/benchmarks/dashboard/ci_microbenchmark_runner.py b/benchmarks/dashboard/ci_microbenchmark_runner.py index d492712d85..a8b7ae048d 100644 --- a/benchmarks/dashboard/ci_microbenchmark_runner.py +++ b/benchmarks/dashboard/ci_microbenchmark_runner.py @@ -120,26 +120,22 @@ def 
run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: result = run_inference(config) if result is not None: - ## Create benchmark result in OSS format - - # Compile mode speedup - compile_speedup_result = create_benchmark_result( + # Create benchmark result in OSS format + speedup_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Fwd Speedup (x)", - metric_values=[result.compile_speedup_on_baseline], + metric_values=[result.speedup], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, ) - results.append(compile_speedup_result) - - # Compile mode baseline - compile_baseline_time_result = create_benchmark_result( + results.append(speedup_result) + baseline_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Bfloat16 Fwd Time (ms)", - metric_values=[result.compile_baseline_inference_time_in_ms], + metric_values=[result.baseline_inference_time_in_ms], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, @@ -147,41 +143,12 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: "unit": "ms", }, ) - results.append(compile_baseline_time_result) - - # Compile mode quantized - compile_quantize_time_result = create_benchmark_result( + results.append(baseline_time_result) + quantize_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Quantized Fwd Time (ms)", - metric_values=[result.compile_model_inference_time_in_ms], - quant_type=config.quantization, - device=config.device, - torch_compile_mode=config.torch_compile_mode, - metric_extra_info={ - "unit": "ms", - }, - ) - results.append(compile_quantize_time_result) - - # Eager mode speedup - eager_speedup_result = create_benchmark_result( - benchmark_name="TorchAO Quantization Benchmark", - shape=[config.m, config.k, config.n], - metric_name="Fwd Speedup w/ Eager (x)", - metric_values=[result.eager_speedup_on_baseline], - quant_type=config.quantization, - device=config.device, - torch_compile_mode=config.torch_compile_mode, - ) - results.append(eager_speedup_result) - - # Eager mode baseline - eager_baseline_time_result = create_benchmark_result( - benchmark_name="TorchAO Quantization Benchmark", - shape=[config.m, config.k, config.n], - metric_name="Bfloat16 Fwd Time w/ Eager (ms)", - metric_values=[result.eager_baseline_inference_time_in_ms], + metric_values=[result.model_inference_time_in_ms], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, @@ -189,36 +156,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: "unit": "ms", }, ) - results.append(eager_baseline_time_result) - - # Eager mode quantized - eager_quantize_time_result = create_benchmark_result( - benchmark_name="TorchAO Quantization Benchmark", - shape=[config.m, config.k, config.n], - metric_name="Quantized Fwd Time w/ Eager (ms)", - metric_values=[result.eager_model_inference_time_in_ms], - quant_type=config.quantization, - device=config.device, - torch_compile_mode=config.torch_compile_mode, - metric_extra_info={ - "unit": "ms", - }, - ) - results.append(eager_quantize_time_result) - - ## Compile vs eager results - compile_eager_speedup_result = create_benchmark_result( - benchmark_name="TorchAO Quantization Benchmark", - shape=[config.m, config.k, 
config.n], - metric_name="Eager vs Compile Fwd Speedup (x)", - metric_values=[result.compile_speedup_on_eager], - quant_type=config.quantization, - device=config.device, - torch_compile_mode=config.torch_compile_mode, - ) - results.append(compile_eager_speedup_result) - - ## Memory results + results.append(quantize_time_result) allocated_memory_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], diff --git a/benchmarks/dashboard/microbenchmark_quantization_config.yml b/benchmarks/dashboard/microbenchmark_quantization_config.yml index 8156422668..774237d54c 100644 --- a/benchmarks/dashboard/microbenchmark_quantization_config.yml +++ b/benchmarks/dashboard/microbenchmark_quantization_config.yml @@ -14,6 +14,7 @@ model_params: min_power: 10 max_power: 15 high_precision_dtype: "torch.bfloat16" + use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" From d7aa7abc708300a7678eb75e17f421cefef415d4 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Mon, 4 Aug 2025 12:58:12 -0700 Subject: [PATCH 10/11] updates --- .../microbenchmarks/benchmark_inference.py | 51 ++++++++++--------- .../test/test_benchmark_inference.py | 12 +++-- benchmarks/microbenchmarks/test/test_utils.py | 2 +- benchmarks/microbenchmarks/utils.py | 24 ++++----- 4 files changed, 48 insertions(+), 41 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index b0d617f32d..7aa710da77 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -38,12 +38,12 @@ # ----------------------------------------------------------------------------- # Baseline caching # -# ``_BASELINE_CACHE`` maps a unique key to a tuple +# ``_BASELINE_CACHE`` maps a unique key constructed using _make_cache_key(config) -> (model_type, m, k, n, high_precision_dtype, device, torch_compile_mode) to a tuple # ``(eager_baseline_time, compile_baseline_time)``. See ``_make_cache_key`` for the key # construction. Users should not access this cache directly; it is -# internal to this module. The cache intentionally holds the -# uncompiled base model so that quantized versions can be derived -# without mutating the cached copy. +# internal to this module. 
+# Eg: (linear, 1024, 1024, 1024, torch.bfloat16, cuda, default) -> (95.00, 56.00) +# ----------------------------------------------------------------------------- _BASELINE_CACHE: Dict[Tuple, Tuple[float, float]] = {} @@ -114,30 +114,31 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: # Check if the baseline for this configuration has been computed if cache_key not in _BASELINE_CACHE: # Switch model to eval and move to device - base_model = base_model.eval().to(config.device) + m_copy = deepcopy(base_model) + m_copy = m_copy.eval().to(config.device) print("Benchmarking eager baseline inference.....") eager_baseline_time = model_inference_time_in_ms( - model=base_model, input_data=input_data + model=m_copy, input_data=input_data ) print("Benchmarking compile baseline inference.....") - base_model = torch.compile( - base_model, mode=config.torch_compile_mode, fullgraph=True + m_copy = torch.compile( + m_copy, mode=config.torch_compile_mode, fullgraph=True ) compile_baseline_time = model_inference_time_in_ms( - model=base_model, input_data=input_data + model=m_copy, input_data=input_data ) # Store uncompiled model, input and baseline time _BASELINE_CACHE[cache_key] = (eager_baseline_time, compile_baseline_time) - result.eager_baseline_inference_time_in_ms = eager_baseline_time - result.compile_baseline_inference_time_in_ms = compile_baseline_time + result.baseline_model_eager_inference_time_in_ms = eager_baseline_time + result.baseline_model_compiled_inference_time_in_ms = compile_baseline_time else: # Retrieve cached values cached_eager_time, cached_compile_time = _BASELINE_CACHE[cache_key] - result.eager_baseline_inference_time_in_ms = cached_eager_time - result.compile_baseline_inference_time_in_ms = cached_compile_time + result.baseline_model_eager_inference_time_in_ms = cached_eager_time + result.baseline_model_compiled_inference_time_in_ms = cached_compile_time # At this point, ``base_model`` is an uncompiled model ready for quantization, # and ``input_data`` is the corresponding input tensor. 
The baseline time @@ -180,34 +181,34 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: # Measure inference time for quantized model print("Benchmarking eager quantized model.....") - result.eager_model_inference_time_in_ms = model_inference_time_in_ms( + result.quantized_model_eager_inference_time_in_ms = model_inference_time_in_ms( model=m_copy, input_data=input_data ) # Measure inference time for compiled quantized model print("Benchmarking quantized model.....") m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True) - result.compile_model_inference_time_in_ms = model_inference_time_in_ms( - model=m_copy, input_data=input_data + result.quantized_model_compiled_inference_time_in_ms = ( + model_inference_time_in_ms(model=m_copy, input_data=input_data) ) # Compute eager speedup relative to baseline result.eager_speedup_on_baseline = round( - result.eager_baseline_inference_time_in_ms - / result.eager_model_inference_time_in_ms, - 2, + result.baseline_model_eager_inference_time_in_ms + / result.quantized_model_eager_inference_time_in_ms, + ndigits=2, ) # Compute compile speedup relative to baseline result.compile_speedup_on_baseline = round( - result.compile_baseline_inference_time_in_ms - / result.compile_model_inference_time_in_ms, - 2, + result.baseline_model_compiled_inference_time_in_ms + / result.quantized_model_compiled_inference_time_in_ms, + ndigits=2, ) # Compute compile speedup for quantized model relative to eager quantized model result.compile_speedup_on_eager = round( - result.eager_model_inference_time_in_ms - / result.compile_model_inference_time_in_ms, - 2, + result.quantized_model_eager_inference_time_in_ms + / result.quantized_model_compiled_inference_time_in_ms, + ndigits=2, ) # Run profiler if enabled diff --git a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index 38ffcc5a6c..a2798799a6 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ -45,7 +45,9 @@ def test_run_inference(self, mock_string_to_config): result = run(self.config) self.assertIsInstance(result, BenchmarkResult) - self.assertTrue(hasattr(result, "compile_model_inference_time_in_ms")) + self.assertTrue( + hasattr(result, "quantized_model_compiled_inference_time_in_ms") + ) @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config): @@ -73,7 +75,9 @@ def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config): ) result = run(config) self.assertIsInstance(result, BenchmarkResult) - self.assertTrue(hasattr(result, "compile_model_inference_time_in_ms")) + self.assertTrue( + hasattr(result, "quantized_model_compiled_inference_time_in_ms") + ) @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") def test_run_inference_with_block_sparsity(self, mock_string_to_config): @@ -100,7 +104,9 @@ def test_run_inference_with_block_sparsity(self, mock_string_to_config): ) result = run(config) self.assertIsInstance(result, BenchmarkResult) - self.assertTrue(hasattr(result, "compile_model_inference_time_in_ms")) + self.assertTrue( + hasattr(result, "quantized_model_compiled_inference_time_in_ms") + ) if __name__ == "__main__": diff --git a/benchmarks/microbenchmarks/test/test_utils.py b/benchmarks/microbenchmarks/test/test_utils.py index 864c521251..64af5b67e6 100644 --- 
a/benchmarks/microbenchmarks/test/test_utils.py +++ b/benchmarks/microbenchmarks/test/test_utils.py @@ -74,7 +74,7 @@ def test_benchmark_result(self): result = BenchmarkResult(config=config) self.assertEqual(result.config, config) - self.assertEqual(result.compile_model_inference_time_in_ms, 0.0) + self.assertEqual(result.quantized_model_compiled_inference_time_in_ms, 0.0) def test_get_default_device(self): # Test CPU fallback diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 94c6f19b81..e50f5a065c 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -119,10 +119,10 @@ def __init__( ): self.config = config self.output_dir = config.output_dir - self.eager_baseline_inference_time_in_ms = 0.0 - self.eager_model_inference_time_in_ms = 0.0 - self.compile_baseline_inference_time_in_ms = 0.0 - self.compile_model_inference_time_in_ms = 0.0 + self.baseline_model_eager_inference_time_in_ms = 0.0 + self.quantized_model_eager_inference_time_in_ms = 0.0 + self.baseline_model_compiled_inference_time_in_ms = 0.0 + self.quantized_model_compiled_inference_time_in_ms = 0.0 self.eager_speedup_on_baseline = 0.0 self.compile_speedup_on_baseline = 0.0 self.compile_speedup_on_eager = 0.0 @@ -135,10 +135,10 @@ def to_dict(self) -> Dict[str, Any]: """Convert result to dictionary for main function""" result_dict = { **self.config.to_dict(), - "eager_baseline_inference_time_in_ms": self.eager_baseline_inference_time_in_ms, - "eager_model_inference_time_in_ms": self.eager_model_inference_time_in_ms, - "compile_baseline_inference_time_in_ms": self.compile_baseline_inference_time_in_ms, - "compile_model_inference_time_in_ms": self.compile_model_inference_time_in_ms, + "baseline_model_eager_inference_time_in_ms": self.baseline_model_eager_inference_time_in_ms, + "quantized_model_eager_inference_time_in_ms": self.quantized_model_eager_inference_time_in_ms, + "baseline_model_compiled_inference_time_in_ms": self.baseline_model_compiled_inference_time_in_ms, + "quantized_model_compiled_inference_time_in_ms": self.quantized_model_compiled_inference_time_in_ms, "eager speedup on baseline": self.eager_speedup_on_baseline, "compile speedup on baseline": self.compile_speedup_on_baseline, "eager vs compile speedup": self.compile_speedup_on_eager, @@ -410,11 +410,11 @@ def print_results(results: List[BenchmarkResult]): result.config.quantization or "baseline", result.config.sparsity or "none", f"{result.config.shape_name} ({result.config.m}, {result.config.k}, {result.config.n})", - f"{result.eager_baseline_inference_time_in_ms:.2f}", - f"{result.eager_model_inference_time_in_ms:.2f}", + f"{result.baseline_model_eager_inference_time_in_ms:.2f}", + f"{result.quantized_model_eager_inference_time_in_ms:.2f}", f"{result.eager_speedup_on_baseline:.2f}x", - f"{result.compile_baseline_inference_time_in_ms:.2f}", - f"{result.compile_model_inference_time_in_ms:.2f}", + f"{result.baseline_model_compiled_inference_time_in_ms:.2f}", + f"{result.quantized_model_compiled_inference_time_in_ms:.2f}", f"{result.compile_speedup_on_baseline:.2f}x", f"{result.compile_speedup_on_eager:.2f}x", str(result.config.enable_profiler), From 7ca80b2221bb8484eea3448313f1d0e7b23a4260 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 5 Aug 2025 12:18:28 -0700 Subject: [PATCH 11/11] updates --- benchmarks/dashboard/ci_microbenchmark_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/dashboard/ci_microbenchmark_runner.py 
b/benchmarks/dashboard/ci_microbenchmark_runner.py index a8b7ae048d..e6665caa53 100644 --- a/benchmarks/dashboard/ci_microbenchmark_runner.py +++ b/benchmarks/dashboard/ci_microbenchmark_runner.py @@ -125,7 +125,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Fwd Speedup (x)", - metric_values=[result.speedup], + metric_values=[result.compile_speedup_on_baseline], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, @@ -135,7 +135,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Bfloat16 Fwd Time (ms)", - metric_values=[result.baseline_inference_time_in_ms], + metric_values=[result.baseline_model_compiled_inference_time_in_ms], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, @@ -148,7 +148,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Quantized Fwd Time (ms)", - metric_values=[result.model_inference_time_in_ms], + metric_values=[result.quantized_model_compiled_inference_time_in_ms], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode,
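
For reviewers: the core change in this series (introduced in PATCH 03/11, with result fields renamed in PATCH 10/11) is the module-level ``_BASELINE_CACHE`` keyed only on the parameters that affect the un-quantized model, so every quantization/sparsity recipe for the same model type, shape, dtype, device and compile mode reuses one eager and one compiled baseline measurement. The minimal Python sketch below illustrates that pattern in isolation; SimpleConfig, get_baseline_times and fake_measure are simplified stand-ins for the real BenchmarkConfig, run() and model_inference_time_in_ms, and the recipe names are illustrative only — none of this is part of the patches themselves.

# Sketch of the baseline-caching pattern from benchmark_inference.py.
# SimpleConfig / get_baseline_times / fake_measure are illustrative stand-ins.
from dataclasses import dataclass
from typing import Callable, Dict, Tuple


@dataclass(frozen=True)
class SimpleConfig:
    model_type: str
    m: int
    k: int
    n: int
    high_precision_dtype: str
    device: str
    torch_compile_mode: str
    quantization: str  # deliberately NOT part of the cache key


_BASELINE_CACHE: Dict[Tuple, Tuple[float, float]] = {}


def _make_cache_key(config: SimpleConfig) -> Tuple:
    # Quantization/sparsity are excluded: baseline performance is independent of them.
    return (
        config.model_type,
        config.m,
        config.k,
        config.n,
        config.high_precision_dtype,
        config.device,
        config.torch_compile_mode,
    )


def get_baseline_times(
    config: SimpleConfig, measure: Callable[..., float]
) -> Tuple[float, float]:
    """Return (eager_ms, compiled_ms), measuring only on a cache miss."""
    key = _make_cache_key(config)
    if key not in _BASELINE_CACHE:
        _BASELINE_CACHE[key] = (measure(compiled=False), measure(compiled=True))
    return _BASELINE_CACHE[key]


# Two configs that differ only in quantization share a single baseline measurement,
# matching the example in the PATCH 10 comment: (..., cuda, default) -> (95.00, 56.00).
timings = iter([95.0, 56.0])
fake_measure = lambda compiled: next(timings)
cfg_a = SimpleConfig("linear", 1024, 1024, 1024, "torch.bfloat16", "cuda", "default", "int8wo")
cfg_b = SimpleConfig("linear", 1024, 1024, 1024, "torch.bfloat16", "cuda", "default", "int4wo")
assert get_baseline_times(cfg_a, fake_measure) == (95.0, 56.0)
assert get_baseline_times(cfg_b, fake_measure) == (95.0, 56.0)  # cache hit, no re-measurement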