From b76843a89213d69e5fa58272ac3dcee775dda81e Mon Sep 17 00:00:00 2001 From: jainapurva Date: Sun, 27 Jul 2025 18:20:20 -0700 Subject: [PATCH 01/11] Remove double baseline calculations --- .../microbenchmarks/benchmark_inference.py | 180 +----------------- .../microbenchmarks/benchmark_runner.py | 3 - benchmarks/microbenchmarks/utils.py | 40 ++-- torchao/testing/model_architectures.py | 1 + 4 files changed, 31 insertions(+), 193 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index 77ae7080ef..ecddc88b5f 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -1,181 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. - -# This source code is licensed under the license found in the +# +# This source code is licensed under the BSD 3-Clause license found in the # LICENSE file in the root directory of this source tree. -""" -Inference benchmark runner - -This script runs inference benchmarks and generates a micro-benchmarking report for it. -- run() function is the main entry point for running inference benchmarks. -""" - -import os -from copy import deepcopy -from pathlib import Path - -import torch - -from benchmarks.microbenchmarks.profiler import ( - generate_memory_profile, - generate_model_profile, - visualize_memory_profile, -) -from benchmarks.microbenchmarks.utils import ( - BenchmarkConfig, - BenchmarkResult, - clean_caches, - model_inference_time_in_ms, - string_to_config, -) -from torchao.quantization import quantize_ -from torchao.sparsity.sparse_api import sparsify_ -from torchao.testing.model_architectures import ( - create_model_and_input_data, -) - - -def run(config: BenchmarkConfig) -> BenchmarkResult: - """Run inference benchmarks""" - try: - clean_caches() # Clean caches - - # Create output directory if it doesn't exist - Path(config.output_dir).mkdir(parents=True, exist_ok=True) - - base_model, input_data = create_model_and_input_data( - config.model_type, - config.m, - config.k, - config.n, - high_precision_dtype=config.high_precision_dtype, - device=config.device, - ) - # Copy base model for quantizing - m_copy = deepcopy(base_model) - - # Run benchmarks - result = BenchmarkResult(config=config) - - # Store result in model for memory profiling - base_model._benchmark_result = result - - # Run baseline benchmarking - base_model = base_model.eval().to(config.device) - if config.use_torch_compile: - print("Compiling baseline model....") - base_model = torch.compile( - base_model, mode=config.torch_compile_mode, fullgraph=True - ) - # Benchmark time to run an inference call for baseline model - print("Benchmarking baseline inference.....") - result.baseline_inference_time_in_ms = model_inference_time_in_ms( - model=base_model, input_data=input_data - ) - - ao_base_config = string_to_config( - config.quantization, - config.sparsity, - high_precision_dtype=config.high_precision_dtype, - ) - - # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA) - is_cuda = config.device == "cuda" and torch.cuda.is_available() - - if config.sparsity is not None and ( - config.quantization is None or "baseline" in config.quantization - ): - if is_cuda: - print(f"Applying {config.sparsity} sparsity to model") - sparsify_(m_copy, ao_base_config) - else: - print( - f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}" - ) - elif 
config.sparsity is None and ( - config.quantization is None or "baseline" in config.quantization - ): - pass # No quantization or sparsity specified, do nothing - else: - print("Quantizing model....") - m_copy = m_copy.eval().to(config.device) - quantize_(m_copy, ao_base_config) - - if config.use_torch_compile: - print("Compiling quantized model....") - m_copy = torch.compile( - m_copy, mode=config.torch_compile_mode, fullgraph=True - ) - - # Store result in model for memory profiling - m_copy._benchmark_result = result - - # Benchmark time to run an inference call for quantized model - print("Benchmarking quantized model.....") - result.model_inference_time_in_ms = model_inference_time_in_ms( - model=m_copy, input_data=input_data - ) - - # Calculate speedup w.r.t. baseline - result.speedup = round( - result.baseline_inference_time_in_ms / result.model_inference_time_in_ms, 2 - ) - - # Run profiler if enabled - if config.enable_profiler: - print("Running profiler...") - try: - profiler_json_path = generate_model_profile( - model=m_copy, - input_data=input_data, - profile_file_path=os.path.join( - config.output_dir, - "profiler", - f"{config._file_name}_profile.json", - ), - ) - result.profiler_json_path = profiler_json_path - except Exception as e: - print(f"Error running profiler: {e}") - - # Run memory profiler if enabled - if config.enable_memory_profiler: - print("Running memory profiler...") - try: - # Create memory profiler directory if it doesn't exist - memory_profiler_dir = os.path.join( - config.output_dir, "memory_profiler/pickle" - ) - os.makedirs(memory_profiler_dir, exist_ok=True) - - # Save memory profile with .pickle extension - result.memory_profile_path, result.memory_stats = ( - generate_memory_profile( - model=m_copy, - input_data=input_data, - profile_file_path=os.path.join( - memory_profiler_dir, - f"{config._file_name}_memory_profile.pickle", - ), - ) - ) - - if result.memory_profile_path: - result.memory_visualization_path = visualize_memory_profile( - result.memory_profile_path - ) - except ValueError as e: - if "not enough values to unpack" in e: - print( - "Failed due to existing bugs, re-run the code to generate memory profile. Please raise an issue if it persists." 
- ) - except Exception as e: - print(f"Error running memory profiler: {e}") - import traceback - - traceback.print_exc() - - return result - except Exception as e: - print(f"Error in benchmark run: {config.name} with error: {e}") - return None diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 8066b71714..45a0534ee0 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -139,9 +139,6 @@ def get_quantization_sparsity_recipes( """ config_recipes = set() - # Always include baseline without sparsity - config_recipes.add(("baseline", None)) - # Add all quantization techniques without sparsity for quant_config in quantization_recipes: config_recipes.add((quant_config, None)) diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 40bce5c33d..d7229c8a14 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -125,9 +125,13 @@ def __init__( ): self.config = config self.output_dir = config.output_dir - self.baseline_inference_time_in_ms = 0.0 - self.model_inference_time_in_ms = 0.0 - self.speedup = 0.0 + self.eager_baseline_inference_time_in_ms = 0.0 + self.eager_model_inference_time_in_ms = 0.0 + self.compile_baseline_inference_time_in_ms = 0.0 + self.compile_model_inference_time_in_ms = 0.0 + self.eager_speedup_on_baseline = 0.0 + self.compile_speedup_on_baseline = 0.0 + self.compile_speedup_on_eager = 0.0 self.profiler_json_path: Optional[str] = None self.memory_profile_path: Optional[str] = None self.memory_visualization_path: Optional[str] = None @@ -137,9 +141,13 @@ def to_dict(self) -> Dict[str, Any]: """Convert result to dictionary for main function""" result_dict = { **self.config.to_dict(), - "baseline_inference_time_in_ms": self.baseline_inference_time_in_ms, - "model_inference_time_in_ms": self.model_inference_time_in_ms, - "speedup": self.speedup, + "eager_baseline_inference_time_in_ms": self.eager_baseline_inference_time_in_ms, + "eager_model_inference_time_in_ms": self.eager_model_inference_time_in_ms, + "compile_baseline_inference_time_in_ms": self.compile_baseline_inference_time_in_ms, + "compile_model_inference_time_in_ms": self.compile_model_inference_time_in_ms, + "eager speedup on baseline": self.eager_speedup_on_baseline, + "compile speedup on baseline": self.compile_speedup_on_baseline, + "eager vs compile speedup": self.compile_speedup_on_eager, "profiler_json_path": self.profiler_json_path, "memory_profile_path": self.memory_profile_path, "memory_visualization_path": self.memory_visualization_path, @@ -408,9 +416,13 @@ def print_results(results: List[BenchmarkResult]): result.config.quantization or "baseline", result.config.sparsity or "none", f"{result.config.shape_name} ({result.config.m}, {result.config.k}, {result.config.n})", - f"{result.baseline_inference_time_in_ms:.2f}", - f"{result.model_inference_time_in_ms:.2f}", - f"{result.speedup:.2f}x", + f"{result.eager_baseline_inference_time_in_ms:.2f}", + f"{result.eager_model_inference_time_in_ms:.2f}", + f"{result.eager_speedup_on_baseline:.2f}x", + f"{result.compile_baseline_inference_time_in_ms:.2f}", + f"{result.compile_model_inference_time_in_ms:.2f}", + f"{result.compile_speedup_on_baseline:.2f}x", + f"{result.compile_speedup_on_eager:.2f}x", str(result.config.enable_profiler), ] @@ -422,9 +434,13 @@ def print_results(results: List[BenchmarkResult]): "Quantization", "Sparsity", "Shape", - "Baseline Inference Time 
(ms)", - "Inference Time (ms)", - "Speedup", + "Eager Baseline Inference Time (ms)", + "Eager Model Inference Time (ms)", + "Eager Speedup", + "Compile Baseline Inference Time (ms)", + "Compile Model Inference Time (ms)", + "Compile Speedup", + "Eager vs Compile Speedup", "Profiler Enabled", ] diff --git a/torchao/testing/model_architectures.py b/torchao/testing/model_architectures.py index f59a1271b1..9f02bbebc5 100644 --- a/torchao/testing/model_architectures.py +++ b/torchao/testing/model_architectures.py @@ -156,6 +156,7 @@ def create_model_and_input_data( high_precision_dtype (torch.dtype): data type of the model m, k, n (int): dimensions of the model and input data """ + torch.manual_seed(42) if model_type == "linear": model = ToyLinearModel(k, n, high_precision_dtype).to(device) input_data = torch.randn(m, k, device=device, dtype=high_precision_dtype) From 4da5639aec20e33334cc0d0e348710f7ff97c14a Mon Sep 17 00:00:00 2001 From: jainapurva Date: Sun, 27 Jul 2025 18:29:19 -0700 Subject: [PATCH 02/11] Calculate both compile and eager by default --- .../dashboard/microbenchmark_quantization_config.yml | 1 - benchmarks/microbenchmarks/test/benchmark_config.yml | 4 ---- .../microbenchmarks/test/test_benchmark_inference.py | 3 --- .../microbenchmarks/test/test_benchmark_runner.py | 2 -- benchmarks/microbenchmarks/test/test_utils.py | 2 -- benchmarks/microbenchmarks/utils.py | 10 ++-------- docs/source/benchmarking_api_guide.md | 6 ++---- 7 files changed, 4 insertions(+), 24 deletions(-) diff --git a/benchmarks/dashboard/microbenchmark_quantization_config.yml b/benchmarks/dashboard/microbenchmark_quantization_config.yml index 774237d54c..8156422668 100644 --- a/benchmarks/dashboard/microbenchmark_quantization_config.yml +++ b/benchmarks/dashboard/microbenchmark_quantization_config.yml @@ -14,7 +14,6 @@ model_params: min_power: 10 max_power: 15 high_precision_dtype: "torch.bfloat16" - use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 4fd5eb2018..40db49e223 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -13,7 +13,6 @@ model_params: min_power: 14 max_power: 16 high_precision_dtype: "torch.bfloat16" - use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" @@ -27,7 +26,6 @@ model_params: [2048, 4096, 1024], ] high_precision_dtype: "torch.bfloat16" - use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "ln_linear_sigmoid" @@ -41,7 +39,6 @@ model_params: [2048, 4096, 1024], # For transformer_block, k is the hidden dimension ] high_precision_dtype: "torch.bfloat16" - use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "transformer_block" # TODO: Add a custom model (Figure out how to do this, maybe pass a .py file with model definition) @@ -58,7 +55,6 @@ model_params: min_power: 10 # 1024 max_power: 11 # 2048 high_precision_dtype: "torch.bfloat16" - use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" diff --git a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index 22863dcbcf..e0f55a6aca 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ 
-21,7 +21,6 @@ def setUp(self): sparsity="semi-sparse", params={ "high_precision_dtype": "torch.float32", - "use_torch_compile": False, "device": "cpu", "model_type": "linear", }, @@ -64,7 +63,6 @@ def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config): sparsity="semi-sparse", params={ "high_precision_dtype": "torch.float32", - "use_torch_compile": False, "device": "cpu", "model_type": "linear", }, @@ -92,7 +90,6 @@ def test_run_inference_with_block_sparsity(self, mock_string_to_config): sparsity="block", params={ "high_precision_dtype": "torch.float32", - "use_torch_compile": False, "device": "cpu", "model_type": "linear", }, diff --git a/benchmarks/microbenchmarks/test/test_benchmark_runner.py b/benchmarks/microbenchmarks/test/test_benchmark_runner.py index 2f7e5ba541..f7e54e4bec 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_runner.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_runner.py @@ -39,7 +39,6 @@ def setUp(self): } ], "high_precision_dtype": "torch.bfloat16", - "use_torch_compile": True, "torch_compile_mode": "max-autotune", "device": "cpu", "model_type": "linear", @@ -130,7 +129,6 @@ def test_get_param_combinations(self): self.assertEqual(len(shapes), 1) self.assertEqual(shapes[0], ("custom", [1024, 1024, 1024])) self.assertEqual(params["high_precision_dtype"], "torch.bfloat16") - self.assertEqual(params["use_torch_compile"], True) @patch("argparse.Namespace") def test_load_benchmark_configs(self, mock_args): diff --git a/benchmarks/microbenchmarks/test/test_utils.py b/benchmarks/microbenchmarks/test/test_utils.py index 06f557a8f4..5d21260bf9 100644 --- a/benchmarks/microbenchmarks/test/test_utils.py +++ b/benchmarks/microbenchmarks/test/test_utils.py @@ -33,7 +33,6 @@ def setUp(self): self.test_params = { "name": "test_model", "high_precision_dtype": "torch.bfloat16", - "use_torch_compile": True, "torch_compile_mode": "max-autotune", "device": "cpu", "model_type": "linear", @@ -57,7 +56,6 @@ def test_benchmark_config(self): self.assertEqual(config.k, 1024) self.assertEqual(config.n, 1024) self.assertEqual(config.high_precision_dtype, torch.bfloat16) - self.assertEqual(config.use_torch_compile, True) self.assertEqual(config.torch_compile_mode, "max-autotune") self.assertEqual(config.device, "cpu") self.assertEqual(config.model_type, "linear") diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index d7229c8a14..94c6f19b81 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -73,18 +73,13 @@ def __init__( self.high_precision_dtype = self._parse_precision( params.get("high_precision_dtype", "torch.bfloat16") ) - self.use_torch_compile = bool(params.get("use_torch_compile", False)) - self.torch_compile_mode = ( - params.get("torch_compile_mode", "default") - if self.use_torch_compile - else None - ) + self.torch_compile_mode = params.get("torch_compile_mode", "default") self.device = get_default_device(params.get("device", None)) self.model_type = params.get("model_type", "linear") self.output_dir = f"{output_dir}/{self.benchmark_mode}" self.name = params.get( "name", - f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile' if self.use_torch_compile else ''}", + f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile'}", ) self.enable_profiler = bool(params.get("enable_profiler", False)) self.enable_memory_profiler = bool(params.get("enable_memory_profiler", False)) @@ -108,7 +103,6 @@ def 
to_dict(self) -> Dict[str, Any]: "k": self.k, "n": self.n, "high_precision_dtype": self.high_precision_dtype, - "use_torch_compile": self.use_torch_compile, "torch_compile_mode": self.torch_compile_mode, "device": self.device, "model_type": self.model_type, diff --git a/docs/source/benchmarking_api_guide.md b/docs/source/benchmarking_api_guide.md index b07a0e14ff..bd81a7f65f 100644 --- a/docs/source/benchmarking_api_guide.md +++ b/docs/source/benchmarking_api_guide.md @@ -122,7 +122,6 @@ model_params: min_power: 10 max_power: 15 high_precision_dtype: "torch.bfloat16" - use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" @@ -199,9 +198,8 @@ python -m unittest discover benchmarks/microbenchmarks/test ### Common Issues 1. **CUDA Out of Memory**: Reduce batch size or matrix dimensions -2. **Compilation Errors**: Set `use_torch_compile: false` for debugging -3. **Missing Quantization Methods**: Ensure TorchAO is properly installed -4. **Device Not Available**: Check device availability and drivers +2. **Missing Quantization Methods**: Ensure TorchAO is properly installed +3. **Device Not Available**: Check device availability and drivers ### Best Practices From 804bea7719e977b40309f41d4897de926bb9f2de Mon Sep 17 00:00:00 2001 From: jainapurva Date: Sun, 27 Jul 2025 18:43:02 -0700 Subject: [PATCH 03/11] Updates --- .../microbenchmarks/benchmark_inference.py | 270 +++++++++++++++++- 1 file changed, 268 insertions(+), 2 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index ecddc88b5f..47aff4d7be 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -1,5 +1,271 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# -# This source code is licensed under the BSD 3-Clause license found in the + +# This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. +""" +Inference benchmark runner + +This script runs inference benchmarks and generates a micro-benchmarking report for it. +- run() function is the main entry point for running inference benchmarks. +""" + +import os +from copy import deepcopy +from pathlib import Path +from typing import Dict, Tuple + +import torch + +from benchmarks.microbenchmarks.profiler import ( + generate_memory_profile, + generate_model_profile, + visualize_memory_profile, +) +from benchmarks.microbenchmarks.utils import ( + BenchmarkConfig, + BenchmarkResult, + clean_caches, + model_inference_time_in_ms, + string_to_config, +) +from torchao.quantization import quantize_ +from torchao.sparsity.sparse_api import sparsify_ +from torchao.testing.model_architectures import ( + create_model_and_input_data, +) + +# ----------------------------------------------------------------------------- +# Baseline caching +# +# ``_BASELINE_CACHE`` maps a unique key to a tuple +# ``(eager_baseline_time, compile_baseline_time)``. See ``_make_cache_key`` for the key +# construction. Users should not access this cache directly; it is +# internal to this module. The cache intentionally holds the +# uncompiled base model so that quantized versions can be derived +# without mutating the cached copy. + +_BASELINE_CACHE: Dict[Tuple, Tuple[float, float]] = {} + + +def _make_cache_key(config: BenchmarkConfig) -> Tuple: + """Create a key for caching based on benchmark configuration. 
+ + Parameters that affect baseline performance are included: + + * model type (e.g. ``linear`` or ``transformer_block``) + * shape dimensions (m, k, n) + * high precision dtype (bf16, fp16, etc.) + * device (cuda, cpu, mps) + * compile settings (whether compile is enabled and compile mode) + + Sparsity and quantization settings are deliberately excluded + because the baseline (non‑quantized, non‑sparse) performance is + independent of those attributes. + """ + return ( + config.model_type, + config.m, + config.k, + config.n, + config.high_precision_dtype, + config.device, + config.torch_compile_mode, + ) + + +def run(config: BenchmarkConfig) -> BenchmarkResult: + """ + Run inference benchmarks. + + The function first checks if a baseline for the given configuration + already exists in the internal cache. If not, it measures the baseline + inference time and stores the result. When the baseline is cached, + the function reuses the stored model and input data to + benchmark quantized variants, avoiding redundant baseline measurements. + + Args: + config (BenchmarkConfig): Benchmark configuration. + + Returns: + BenchmarkResult: Result of the benchmark. + """ + try: + clean_caches() # Clean caches + + # Create output directory if it doesn't exist + Path(config.output_dir).mkdir(parents=True, exist_ok=True) + + # Prepare result container + result = BenchmarkResult(config=config) + + # Create model and input data + base_model, input_data = create_model_and_input_data( + config.model_type, + config.m, + config.k, + config.n, + high_precision_dtype=config.high_precision_dtype, + device=config.device, + ) + + # Generate a cache key for the current configuration + cache_key = _make_cache_key(config) + + # Check if the baseline for this configuration has been computed + if cache_key not in _BASELINE_CACHE: + # Switch model to eval and move to device + base_model = base_model.eval().to(config.device) + print("Benchmarking eager baseline inference.....") + eager_baseline_time = model_inference_time_in_ms( + model=base_model, input_data=input_data + ) + + print("Benchmarking compile baseline inference.....") + base_model = torch.compile( + base_model, mode=config.torch_compile_mode, fullgraph=True + ) + compile_baseline_time = model_inference_time_in_ms( + model=base_model, input_data=input_data + ) + + # Store uncompiled model, input and baseline time + _BASELINE_CACHE[cache_key] = (eager_baseline_time, compile_baseline_time) + + result.eager_baseline_inference_time_in_ms = eager_baseline_time + result.compile_baseline_inference_time_in_ms = compile_baseline_time + else: + # Retrieve cached values + cached_eager_time, cached_compile_time = _BASELINE_CACHE[cache_key] + result.eager_baseline_inference_time_in_ms = cached_eager_time + result.compile_baseline_inference_time_in_ms = cached_compile_time + + # At this point, ``base_model`` is an uncompiled model ready for quantization, + # and ``input_data`` is the corresponding input tensor. The baseline time + # has been stored in ``result.baseline_inference_time_in_ms``. 
+ + # Copy base model for quantizing/sparsifying + m_copy = deepcopy(base_model) + + # Determine quantization/sparsity configuration + ao_base_config = string_to_config( + config.quantization, + config.sparsity, + high_precision_dtype=config.high_precision_dtype, + ) + + # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA) + is_cuda = config.device == "cuda" and torch.cuda.is_available() + + if config.sparsity is not None and ( + config.quantization is None or "baseline" in config.quantization + ): + if is_cuda: + print(f"Applying {config.sparsity} sparsity to model") + sparsify_(m_copy, ao_base_config) + else: + print( + f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}" + ) + elif config.sparsity is None and ( + config.quantization is None or "baseline" in config.quantization + ): + pass # No quantization or sparsity specified, do nothing + else: + print("Quantizing model....") + m_copy = m_copy.eval().to(config.device) + quantize_(m_copy, ao_base_config) + + # Store result in model for memory profiling + m_copy._benchmark_result = result + + # Measure inference time for quantized model + print("Benchmarking eager quantized model.....") + result.eager_model_inference_time_in_ms = model_inference_time_in_ms( + model=m_copy, input_data=input_data + ) + + # Measure inference time for compiled quantized model + print("Benchmarking quantized model.....") + m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True) + result.compile_model_inference_time_in_ms = model_inference_time_in_ms( + model=m_copy, input_data=input_data + ) + + # Compute eager speedup relative to baseline + result.eager_speedup_on_baseline = round( + result.eager_baseline_inference_time_in_ms + / result.eager_model_inference_time_in_ms, + 2, + ) + # Compute compile speedup relative to baseline + result.compile_speedup_on_baseline = round( + result.compile_baseline_inference_time_in_ms + / result.compile_model_inference_time_in_ms, + 2, + ) + # Compute compile speedup for quantized model relative to eager quantized model + result.compile_speedup_on_eager = round( + result.eager_model_inference_time_in_ms + / result.compile_model_inference_time_in_ms, + 2, + ) + + # Run profiler if enabled + if config.enable_profiler: + print("Running profiler...") + try: + profiler_json_path = generate_model_profile( + model=m_copy, + input_data=input_data, + profile_file_path=os.path.join( + config.output_dir, + "profiler", + f"{config._file_name}_profile.json", + ), + ) + result.profiler_json_path = profiler_json_path + except Exception as e: + print(f"Error running profiler: {e}") + + # Run memory profiler if enabled + if config.enable_memory_profiler: + print("Running memory profiler...") + try: + # Create memory profiler directory if it doesn't exist + memory_profiler_dir = os.path.join( + config.output_dir, "memory_profiler/pickle" + ) + os.makedirs(memory_profiler_dir, exist_ok=True) + + # Save memory profile with .pickle extension + result.memory_profile_path, result.memory_stats = ( + generate_memory_profile( + model=m_copy, + input_data=input_data, + profile_file_path=os.path.join( + memory_profiler_dir, + f"{config._file_name}_memory_profile.pickle", + ), + ) + ) + + if result.memory_profile_path: + result.memory_visualization_path = visualize_memory_profile( + result.memory_profile_path + ) + except ValueError as e: + if "not enough values to unpack" in str(e): + print( + "Failed due to existing bugs, re‑run the code to 
generate memory profile. Please raise an issue if it persists." + ) + except Exception as e: + print(f"Error running memory profiler: {e}") + import traceback + + traceback.print_exc() + + return result + except Exception as e: + print(f"Error in benchmark run: {config.name} with error: {e}") + return None From a193e4cf90a2e8a60a85dbb67ee9987977cad389 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Sun, 27 Jul 2025 18:45:48 -0700 Subject: [PATCH 04/11] Updates --- benchmarks/microbenchmarks/benchmark_inference.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index 47aff4d7be..b0d617f32d 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -81,8 +81,7 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: The function first checks if a baseline for the given configuration already exists in the internal cache. If not, it measures the baseline inference time and stores the result. When the baseline is cached, - the function reuses the stored model and input data to - benchmark quantized variants, avoiding redundant baseline measurements. + the function reuses the cached baselines to calculate speedup metrics. Args: config (BenchmarkConfig): Benchmark configuration. From d0b318fea309312c6e7a50824ec7cc728ca4e8c7 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Sun, 27 Jul 2025 23:37:37 -0700 Subject: [PATCH 05/11] Update CI run --- .../dashboard/ci_microbenchmark_runner.py | 72 ++++++++++++++++--- 1 file changed, 63 insertions(+), 9 deletions(-) diff --git a/benchmarks/dashboard/ci_microbenchmark_runner.py b/benchmarks/dashboard/ci_microbenchmark_runner.py index a8b7ae048d..3fda108f20 100644 --- a/benchmarks/dashboard/ci_microbenchmark_runner.py +++ b/benchmarks/dashboard/ci_microbenchmark_runner.py @@ -121,21 +121,23 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: if result is not None: # Create benchmark result in OSS format - speedup_result = create_benchmark_result( + + ## Compile mode results + compile_speedup_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Fwd Speedup (x)", - metric_values=[result.speedup], + metric_values=[result.compile_speedup_on_baseline], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, ) - results.append(speedup_result) - baseline_time_result = create_benchmark_result( + results.append(compile_speedup_result) + compile_baseline_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Bfloat16 Fwd Time (ms)", - metric_values=[result.baseline_inference_time_in_ms], + metric_values=[result.compile_baseline_inference_time_in_ms], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, @@ -143,12 +145,37 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: "unit": "ms", }, ) - results.append(baseline_time_result) - quantize_time_result = create_benchmark_result( + results.append(compile_baseline_time_result) + compile_quantize_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Quantized Fwd Time (ms)", - metric_values=[result.model_inference_time_in_ms], + 
metric_values=[result.compile_model_inference_time_in_ms], + quant_type=config.quantization, + device=config.device, + torch_compile_mode=config.torch_compile_mode, + metric_extra_info={ + "unit": "ms", + }, + ) + results.append(compile_quantize_time_result) + + ## Eager mode results + eager_speedup_result = create_benchmark_result( + benchmark_name="TorchAO Quantization Benchmark", + shape=[config.m, config.k, config.n], + metric_name="Fwd Speedup (x)", + metric_values=[result.eager_speedup_on_baseline], + quant_type=config.quantization, + device=config.device, + torch_compile_mode=config.torch_compile_mode, + ) + results.append(eager_speedup_result) + eager_baseline_time_result = create_benchmark_result( + benchmark_name="TorchAO Quantization Benchmark", + shape=[config.m, config.k, config.n], + metric_name="Bfloat16 Fwd Time (ms)", + metric_values=[result.eager_baseline_inference_time_in_ms], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, @@ -156,7 +183,34 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: "unit": "ms", }, ) - results.append(quantize_time_result) + results.append(eager_baseline_time_result) + eager_quantize_time_result = create_benchmark_result( + benchmark_name="TorchAO Quantization Benchmark", + shape=[config.m, config.k, config.n], + metric_name="Quantized Fwd Time (ms)", + metric_values=[result.eager_model_inference_time_in_ms], + quant_type=config.quantization, + device=config.device, + torch_compile_mode=config.torch_compile_mode, + metric_extra_info={ + "unit": "ms", + }, + ) + results.append(eager_quantize_time_result) + + ## Compile vs eager results + compile_eager_speedup_result = create_benchmark_result( + benchmark_name="TorchAO Quantization Benchmark", + shape=[config.m, config.k, config.n], + metric_name="Eager vs Compile Fwd Speedup (x)", + metric_values=[result.compile_speedup_on_eager], + quant_type=config.quantization, + device=config.device, + torch_compile_mode=config.torch_compile_mode, + ) + results.append(compile_eager_speedup_result) + + ## Memory results allocated_memory_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], From 63cd52495800895195e260a9ff3e6af114ba2750 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Mon, 28 Jul 2025 09:42:45 -0700 Subject: [PATCH 06/11] Update column names --- benchmarks/dashboard/ci_microbenchmark_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/dashboard/ci_microbenchmark_runner.py b/benchmarks/dashboard/ci_microbenchmark_runner.py index 3fda108f20..dc9c1b4963 100644 --- a/benchmarks/dashboard/ci_microbenchmark_runner.py +++ b/benchmarks/dashboard/ci_microbenchmark_runner.py @@ -164,7 +164,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: eager_speedup_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], - metric_name="Fwd Speedup (x)", + metric_name="Fwd Speedup w/ Eager (x)", metric_values=[result.eager_speedup_on_baseline], quant_type=config.quantization, device=config.device, @@ -174,7 +174,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: eager_baseline_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], - metric_name="Bfloat16 Fwd Time (ms)", + metric_name="Bfloat16 Fwd Time w/ Eager (ms)", 
metric_values=[result.eager_baseline_inference_time_in_ms], quant_type=config.quantization, device=config.device, @@ -187,7 +187,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: eager_quantize_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], - metric_name="Quantized Fwd Time (ms)", + metric_name="Quantized Fwd Time w/ Eager (ms)", metric_values=[result.eager_model_inference_time_in_ms], quant_type=config.quantization, device=config.device, From 5a35513cbb92d7e3ba888a4f2805cd73a9e5048a Mon Sep 17 00:00:00 2001 From: jainapurva Date: Mon, 28 Jul 2025 12:21:40 -0700 Subject: [PATCH 07/11] update comments --- benchmarks/dashboard/ci_microbenchmark_runner.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/benchmarks/dashboard/ci_microbenchmark_runner.py b/benchmarks/dashboard/ci_microbenchmark_runner.py index dc9c1b4963..d492712d85 100644 --- a/benchmarks/dashboard/ci_microbenchmark_runner.py +++ b/benchmarks/dashboard/ci_microbenchmark_runner.py @@ -120,9 +120,9 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: result = run_inference(config) if result is not None: - # Create benchmark result in OSS format + ## Create benchmark result in OSS format - ## Compile mode results + # Compile mode speedup compile_speedup_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], @@ -133,6 +133,8 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: torch_compile_mode=config.torch_compile_mode, ) results.append(compile_speedup_result) + + # Compile mode baseline compile_baseline_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], @@ -146,6 +148,8 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: }, ) results.append(compile_baseline_time_result) + + # Compile mode quantized compile_quantize_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], @@ -160,7 +164,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: ) results.append(compile_quantize_time_result) - ## Eager mode results + # Eager mode speedup eager_speedup_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], @@ -171,6 +175,8 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: torch_compile_mode=config.torch_compile_mode, ) results.append(eager_speedup_result) + + # Eager mode baseline eager_baseline_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], @@ -184,6 +190,8 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: }, ) results.append(eager_baseline_time_result) + + # Eager mode quantized eager_quantize_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], From b089e30c9f3730f15c8c766019bcd53d465c950a Mon Sep 17 00:00:00 2001 From: jainapurva Date: Mon, 28 Jul 2025 12:29:07 -0700 Subject: [PATCH 08/11] test updates --- .../microbenchmarks/test/test_benchmark_inference.py | 6 +++--- .../microbenchmarks/test/test_benchmark_profiler.py | 11 +++++------ benchmarks/microbenchmarks/test/test_utils.py | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git 
a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index e0f55a6aca..38ffcc5a6c 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ -45,7 +45,7 @@ def test_run_inference(self, mock_string_to_config): result = run(self.config) self.assertIsInstance(result, BenchmarkResult) - self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + self.assertTrue(hasattr(result, "compile_model_inference_time_in_ms")) @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config): @@ -73,7 +73,7 @@ def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config): ) result = run(config) self.assertIsInstance(result, BenchmarkResult) - self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + self.assertTrue(hasattr(result, "compile_model_inference_time_in_ms")) @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") def test_run_inference_with_block_sparsity(self, mock_string_to_config): @@ -100,7 +100,7 @@ def test_run_inference_with_block_sparsity(self, mock_string_to_config): ) result = run(config) self.assertIsInstance(result, BenchmarkResult) - self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + self.assertTrue(hasattr(result, "compile_model_inference_time_in_ms")) if __name__ == "__main__": diff --git a/benchmarks/microbenchmarks/test/test_benchmark_profiler.py b/benchmarks/microbenchmarks/test/test_benchmark_profiler.py index 92689c4802..d0c36d8cfe 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_profiler.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_profiler.py @@ -270,13 +270,12 @@ def test_memory_profiler_cuda_unavailable(self): f"{config.name}_{self.m}_{self.k}_{self.n}_memory_profile.json", ) - # Generate memory profile - result, memory_stats = generate_memory_profile( - self.model, self.input_data, memory_profile_path - ) - # Should return None when CUDA is unavailable - self.assertIsNone(result) + self.assertIsNone( + generate_memory_profile( + self.model, self.input_data, memory_profile_path + ) + ) # Should not create file when CUDA is unavailable self.assertFalse(os.path.exists(memory_profile_path)) diff --git a/benchmarks/microbenchmarks/test/test_utils.py b/benchmarks/microbenchmarks/test/test_utils.py index 5d21260bf9..864c521251 100644 --- a/benchmarks/microbenchmarks/test/test_utils.py +++ b/benchmarks/microbenchmarks/test/test_utils.py @@ -74,7 +74,7 @@ def test_benchmark_result(self): result = BenchmarkResult(config=config) self.assertEqual(result.config, config) - self.assertEqual(result.model_inference_time_in_ms, 0.0) + self.assertEqual(result.compile_model_inference_time_in_ms, 0.0) def test_get_default_device(self): # Test CPU fallback From 28f3f6a4a83160863746bab908e091f494b91314 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Fri, 1 Aug 2025 14:30:35 -0700 Subject: [PATCH 09/11] remove dashboard updates' --- .../dashboard/ci_microbenchmark_runner.py | 82 +++---------------- .../microbenchmark_quantization_config.yml | 1 + 2 files changed, 11 insertions(+), 72 deletions(-) diff --git a/benchmarks/dashboard/ci_microbenchmark_runner.py b/benchmarks/dashboard/ci_microbenchmark_runner.py index d492712d85..a8b7ae048d 100644 --- a/benchmarks/dashboard/ci_microbenchmark_runner.py +++ b/benchmarks/dashboard/ci_microbenchmark_runner.py @@ -120,26 +120,22 @@ def 
run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: result = run_inference(config) if result is not None: - ## Create benchmark result in OSS format - - # Compile mode speedup - compile_speedup_result = create_benchmark_result( + # Create benchmark result in OSS format + speedup_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Fwd Speedup (x)", - metric_values=[result.compile_speedup_on_baseline], + metric_values=[result.speedup], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, ) - results.append(compile_speedup_result) - - # Compile mode baseline - compile_baseline_time_result = create_benchmark_result( + results.append(speedup_result) + baseline_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Bfloat16 Fwd Time (ms)", - metric_values=[result.compile_baseline_inference_time_in_ms], + metric_values=[result.baseline_inference_time_in_ms], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, @@ -147,41 +143,12 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: "unit": "ms", }, ) - results.append(compile_baseline_time_result) - - # Compile mode quantized - compile_quantize_time_result = create_benchmark_result( + results.append(baseline_time_result) + quantize_time_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Quantized Fwd Time (ms)", - metric_values=[result.compile_model_inference_time_in_ms], - quant_type=config.quantization, - device=config.device, - torch_compile_mode=config.torch_compile_mode, - metric_extra_info={ - "unit": "ms", - }, - ) - results.append(compile_quantize_time_result) - - # Eager mode speedup - eager_speedup_result = create_benchmark_result( - benchmark_name="TorchAO Quantization Benchmark", - shape=[config.m, config.k, config.n], - metric_name="Fwd Speedup w/ Eager (x)", - metric_values=[result.eager_speedup_on_baseline], - quant_type=config.quantization, - device=config.device, - torch_compile_mode=config.torch_compile_mode, - ) - results.append(eager_speedup_result) - - # Eager mode baseline - eager_baseline_time_result = create_benchmark_result( - benchmark_name="TorchAO Quantization Benchmark", - shape=[config.m, config.k, config.n], - metric_name="Bfloat16 Fwd Time w/ Eager (ms)", - metric_values=[result.eager_baseline_inference_time_in_ms], + metric_values=[result.model_inference_time_in_ms], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, @@ -189,36 +156,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: "unit": "ms", }, ) - results.append(eager_baseline_time_result) - - # Eager mode quantized - eager_quantize_time_result = create_benchmark_result( - benchmark_name="TorchAO Quantization Benchmark", - shape=[config.m, config.k, config.n], - metric_name="Quantized Fwd Time w/ Eager (ms)", - metric_values=[result.eager_model_inference_time_in_ms], - quant_type=config.quantization, - device=config.device, - torch_compile_mode=config.torch_compile_mode, - metric_extra_info={ - "unit": "ms", - }, - ) - results.append(eager_quantize_time_result) - - ## Compile vs eager results - compile_eager_speedup_result = create_benchmark_result( - benchmark_name="TorchAO Quantization Benchmark", - shape=[config.m, config.k, 
config.n], - metric_name="Eager vs Compile Fwd Speedup (x)", - metric_values=[result.compile_speedup_on_eager], - quant_type=config.quantization, - device=config.device, - torch_compile_mode=config.torch_compile_mode, - ) - results.append(compile_eager_speedup_result) - - ## Memory results + results.append(quantize_time_result) allocated_memory_result = create_benchmark_result( benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], diff --git a/benchmarks/dashboard/microbenchmark_quantization_config.yml b/benchmarks/dashboard/microbenchmark_quantization_config.yml index 8156422668..774237d54c 100644 --- a/benchmarks/dashboard/microbenchmark_quantization_config.yml +++ b/benchmarks/dashboard/microbenchmark_quantization_config.yml @@ -14,6 +14,7 @@ model_params: min_power: 10 max_power: 15 high_precision_dtype: "torch.bfloat16" + use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" From d7aa7abc708300a7678eb75e17f421cefef415d4 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Mon, 4 Aug 2025 12:58:12 -0700 Subject: [PATCH 10/11] updates --- .../microbenchmarks/benchmark_inference.py | 51 ++++++++++--------- .../test/test_benchmark_inference.py | 12 +++-- benchmarks/microbenchmarks/test/test_utils.py | 2 +- benchmarks/microbenchmarks/utils.py | 24 ++++----- 4 files changed, 48 insertions(+), 41 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index b0d617f32d..7aa710da77 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -38,12 +38,12 @@ # ----------------------------------------------------------------------------- # Baseline caching # -# ``_BASELINE_CACHE`` maps a unique key to a tuple +# ``_BASELINE_CACHE`` maps a unique key constructed using _make_cache_key(config) -> (model_type, m, k, n, high_precision_dtype, device, torch_compile_mode) to a tuple # ``(eager_baseline_time, compile_baseline_time)``. See ``_make_cache_key`` for the key # construction. Users should not access this cache directly; it is -# internal to this module. The cache intentionally holds the -# uncompiled base model so that quantized versions can be derived -# without mutating the cached copy. +# internal to this module. 
+# Eg: (linear, 1024, 1024, 1024, torch.bfloat16, cuda, default) -> (95.00, 56.00) +# ----------------------------------------------------------------------------- _BASELINE_CACHE: Dict[Tuple, Tuple[float, float]] = {} @@ -114,30 +114,31 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: # Check if the baseline for this configuration has been computed if cache_key not in _BASELINE_CACHE: # Switch model to eval and move to device - base_model = base_model.eval().to(config.device) + m_copy = deepcopy(base_model) + m_copy = m_copy.eval().to(config.device) print("Benchmarking eager baseline inference.....") eager_baseline_time = model_inference_time_in_ms( - model=base_model, input_data=input_data + model=m_copy, input_data=input_data ) print("Benchmarking compile baseline inference.....") - base_model = torch.compile( - base_model, mode=config.torch_compile_mode, fullgraph=True + m_copy = torch.compile( + m_copy, mode=config.torch_compile_mode, fullgraph=True ) compile_baseline_time = model_inference_time_in_ms( - model=base_model, input_data=input_data + model=m_copy, input_data=input_data ) # Store uncompiled model, input and baseline time _BASELINE_CACHE[cache_key] = (eager_baseline_time, compile_baseline_time) - result.eager_baseline_inference_time_in_ms = eager_baseline_time - result.compile_baseline_inference_time_in_ms = compile_baseline_time + result.baseline_model_eager_inference_time_in_ms = eager_baseline_time + result.baseline_model_compiled_inference_time_in_ms = compile_baseline_time else: # Retrieve cached values cached_eager_time, cached_compile_time = _BASELINE_CACHE[cache_key] - result.eager_baseline_inference_time_in_ms = cached_eager_time - result.compile_baseline_inference_time_in_ms = cached_compile_time + result.baseline_model_eager_inference_time_in_ms = cached_eager_time + result.baseline_model_compiled_inference_time_in_ms = cached_compile_time # At this point, ``base_model`` is an uncompiled model ready for quantization, # and ``input_data`` is the corresponding input tensor. 
The baseline time @@ -180,34 +181,34 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: # Measure inference time for quantized model print("Benchmarking eager quantized model.....") - result.eager_model_inference_time_in_ms = model_inference_time_in_ms( + result.quantized_model_eager_inference_time_in_ms = model_inference_time_in_ms( model=m_copy, input_data=input_data ) # Measure inference time for compiled quantized model print("Benchmarking quantized model.....") m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True) - result.compile_model_inference_time_in_ms = model_inference_time_in_ms( - model=m_copy, input_data=input_data + result.quantized_model_compiled_inference_time_in_ms = ( + model_inference_time_in_ms(model=m_copy, input_data=input_data) ) # Compute eager speedup relative to baseline result.eager_speedup_on_baseline = round( - result.eager_baseline_inference_time_in_ms - / result.eager_model_inference_time_in_ms, - 2, + result.baseline_model_eager_inference_time_in_ms + / result.quantized_model_eager_inference_time_in_ms, + ndigits=2, ) # Compute compile speedup relative to baseline result.compile_speedup_on_baseline = round( - result.compile_baseline_inference_time_in_ms - / result.compile_model_inference_time_in_ms, - 2, + result.baseline_model_compiled_inference_time_in_ms + / result.quantized_model_compiled_inference_time_in_ms, + ndigits=2, ) # Compute compile speedup for quantized model relative to eager quantized model result.compile_speedup_on_eager = round( - result.eager_model_inference_time_in_ms - / result.compile_model_inference_time_in_ms, - 2, + result.quantized_model_eager_inference_time_in_ms + / result.quantized_model_compiled_inference_time_in_ms, + ndigits=2, ) # Run profiler if enabled diff --git a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index 38ffcc5a6c..a2798799a6 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ -45,7 +45,9 @@ def test_run_inference(self, mock_string_to_config): result = run(self.config) self.assertIsInstance(result, BenchmarkResult) - self.assertTrue(hasattr(result, "compile_model_inference_time_in_ms")) + self.assertTrue( + hasattr(result, "quantized_model_compiled_inference_time_in_ms") + ) @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config): @@ -73,7 +75,9 @@ def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config): ) result = run(config) self.assertIsInstance(result, BenchmarkResult) - self.assertTrue(hasattr(result, "compile_model_inference_time_in_ms")) + self.assertTrue( + hasattr(result, "quantized_model_compiled_inference_time_in_ms") + ) @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") def test_run_inference_with_block_sparsity(self, mock_string_to_config): @@ -100,7 +104,9 @@ def test_run_inference_with_block_sparsity(self, mock_string_to_config): ) result = run(config) self.assertIsInstance(result, BenchmarkResult) - self.assertTrue(hasattr(result, "compile_model_inference_time_in_ms")) + self.assertTrue( + hasattr(result, "quantized_model_compiled_inference_time_in_ms") + ) if __name__ == "__main__": diff --git a/benchmarks/microbenchmarks/test/test_utils.py b/benchmarks/microbenchmarks/test/test_utils.py index 864c521251..64af5b67e6 100644 --- 
a/benchmarks/microbenchmarks/test/test_utils.py +++ b/benchmarks/microbenchmarks/test/test_utils.py @@ -74,7 +74,7 @@ def test_benchmark_result(self): result = BenchmarkResult(config=config) self.assertEqual(result.config, config) - self.assertEqual(result.compile_model_inference_time_in_ms, 0.0) + self.assertEqual(result.quantized_model_compiled_inference_time_in_ms, 0.0) def test_get_default_device(self): # Test CPU fallback diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 94c6f19b81..e50f5a065c 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -119,10 +119,10 @@ def __init__( ): self.config = config self.output_dir = config.output_dir - self.eager_baseline_inference_time_in_ms = 0.0 - self.eager_model_inference_time_in_ms = 0.0 - self.compile_baseline_inference_time_in_ms = 0.0 - self.compile_model_inference_time_in_ms = 0.0 + self.baseline_model_eager_inference_time_in_ms = 0.0 + self.quantized_model_eager_inference_time_in_ms = 0.0 + self.baseline_model_compiled_inference_time_in_ms = 0.0 + self.quantized_model_compiled_inference_time_in_ms = 0.0 self.eager_speedup_on_baseline = 0.0 self.compile_speedup_on_baseline = 0.0 self.compile_speedup_on_eager = 0.0 @@ -135,10 +135,10 @@ def to_dict(self) -> Dict[str, Any]: """Convert result to dictionary for main function""" result_dict = { **self.config.to_dict(), - "eager_baseline_inference_time_in_ms": self.eager_baseline_inference_time_in_ms, - "eager_model_inference_time_in_ms": self.eager_model_inference_time_in_ms, - "compile_baseline_inference_time_in_ms": self.compile_baseline_inference_time_in_ms, - "compile_model_inference_time_in_ms": self.compile_model_inference_time_in_ms, + "baseline_model_eager_inference_time_in_ms": self.baseline_model_eager_inference_time_in_ms, + "quantized_model_eager_inference_time_in_ms": self.quantized_model_eager_inference_time_in_ms, + "baseline_model_compiled_inference_time_in_ms": self.baseline_model_compiled_inference_time_in_ms, + "quantized_model_compiled_inference_time_in_ms": self.quantized_model_compiled_inference_time_in_ms, "eager speedup on baseline": self.eager_speedup_on_baseline, "compile speedup on baseline": self.compile_speedup_on_baseline, "eager vs compile speedup": self.compile_speedup_on_eager, @@ -410,11 +410,11 @@ def print_results(results: List[BenchmarkResult]): result.config.quantization or "baseline", result.config.sparsity or "none", f"{result.config.shape_name} ({result.config.m}, {result.config.k}, {result.config.n})", - f"{result.eager_baseline_inference_time_in_ms:.2f}", - f"{result.eager_model_inference_time_in_ms:.2f}", + f"{result.baseline_model_eager_inference_time_in_ms:.2f}", + f"{result.quantized_model_eager_inference_time_in_ms:.2f}", f"{result.eager_speedup_on_baseline:.2f}x", - f"{result.compile_baseline_inference_time_in_ms:.2f}", - f"{result.compile_model_inference_time_in_ms:.2f}", + f"{result.baseline_model_compiled_inference_time_in_ms:.2f}", + f"{result.quantized_model_compiled_inference_time_in_ms:.2f}", f"{result.compile_speedup_on_baseline:.2f}x", f"{result.compile_speedup_on_eager:.2f}x", str(result.config.enable_profiler), From 7ca80b2221bb8484eea3448313f1d0e7b23a4260 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 5 Aug 2025 12:18:28 -0700 Subject: [PATCH 11/11] updates --- benchmarks/dashboard/ci_microbenchmark_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/dashboard/ci_microbenchmark_runner.py 
b/benchmarks/dashboard/ci_microbenchmark_runner.py index a8b7ae048d..e6665caa53 100644 --- a/benchmarks/dashboard/ci_microbenchmark_runner.py +++ b/benchmarks/dashboard/ci_microbenchmark_runner.py @@ -125,7 +125,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Fwd Speedup (x)", - metric_values=[result.speedup], + metric_values=[result.compile_speedup_on_baseline], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, @@ -135,7 +135,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Bfloat16 Fwd Time (ms)", - metric_values=[result.baseline_inference_time_in_ms], + metric_values=[result.baseline_model_compiled_inference_time_in_ms], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode, @@ -148,7 +148,7 @@ def run_ci_benchmarks(config_path: str) -> List[Dict[str, Any]]: benchmark_name="TorchAO Quantization Benchmark", shape=[config.m, config.k, config.n], metric_name="Quantized Fwd Time (ms)", - metric_values=[result.model_inference_time_in_ms], + metric_values=[result.quantized_model_compiled_inference_time_in_ms], quant_type=config.quantization, device=config.device, torch_compile_mode=config.torch_compile_mode,
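
For reviewers: the core change in this series (introduced in PATCH 03/11, with result fields renamed in PATCH 10/11) is the module-level ``_BASELINE_CACHE`` keyed only on the parameters that affect the un-quantized model, so every quantization/sparsity recipe for the same model type, shape, dtype, device and compile mode reuses one eager and one compiled baseline measurement. The minimal Python sketch below illustrates that pattern in isolation; SimpleConfig, get_baseline_times and fake_measure are simplified stand-ins for the real BenchmarkConfig, run() and model_inference_time_in_ms, and the recipe names are illustrative only — none of this is part of the patches themselves.

# Sketch of the baseline-caching pattern from benchmark_inference.py.
# SimpleConfig / get_baseline_times / fake_measure are illustrative stand-ins.
from dataclasses import dataclass
from typing import Callable, Dict, Tuple


@dataclass(frozen=True)
class SimpleConfig:
    model_type: str
    m: int
    k: int
    n: int
    high_precision_dtype: str
    device: str
    torch_compile_mode: str
    quantization: str  # deliberately NOT part of the cache key


_BASELINE_CACHE: Dict[Tuple, Tuple[float, float]] = {}


def _make_cache_key(config: SimpleConfig) -> Tuple:
    # Quantization/sparsity are excluded: baseline performance is independent of them.
    return (
        config.model_type,
        config.m,
        config.k,
        config.n,
        config.high_precision_dtype,
        config.device,
        config.torch_compile_mode,
    )


def get_baseline_times(
    config: SimpleConfig, measure: Callable[..., float]
) -> Tuple[float, float]:
    """Return (eager_ms, compiled_ms), measuring only on a cache miss."""
    key = _make_cache_key(config)
    if key not in _BASELINE_CACHE:
        _BASELINE_CACHE[key] = (measure(compiled=False), measure(compiled=True))
    return _BASELINE_CACHE[key]


# Two configs that differ only in quantization share a single baseline measurement,
# matching the example in the PATCH 10 comment: (..., cuda, default) -> (95.00, 56.00).
timings = iter([95.0, 56.0])
fake_measure = lambda compiled: next(timings)
cfg_a = SimpleConfig("linear", 1024, 1024, 1024, "torch.bfloat16", "cuda", "default", "int8wo")
cfg_b = SimpleConfig("linear", 1024, 1024, 1024, "torch.bfloat16", "cuda", "default", "int4wo")
assert get_baseline_times(cfg_a, fake_measure) == (95.0, 56.0)
assert get_baseline_times(cfg_b, fake_measure) == (95.0, 56.0)  # cache hit, no re-measurement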