Add gpu_name as a parameter in roofline estimate utils (#2657)

chowarfb · facebook-github-bot · commit 73ff48f6905b · 2025-07-31T22:31:41.000-07:00
Summary: Pull Request resolved: #2657 See title, this lets us get estimates without needing to run on the hardware we're getting estimates for Differential Revision: D79415350
diff --git a/torchao/testing/training/roofline_utils.py b/torchao/testing/training/roofline_utils.py
@@ -65,8 +65,9 @@
 }
 
 
-def get_specs():
-    gpu_name = torch.cuda.get_device_name(0)
+def get_specs(gpu_name: Optional[str]=None):
+    if gpu_name is None:
+        gpu_name = torch.cuda.get_device_name(0)
     return gpu_name_to_specs[gpu_name]
 
 
@@ -213,10 +214,10 @@ def get_tensor_memory_traffic_ovhd_s(
 
 
 def get_individual_gemm_time_sympy(
-    M: sympy.Symbol, K: sympy.Symbol, N: sympy.Symbol, dtype, mx_recipe_name
+        M: sympy.Symbol, K: sympy.Symbol, N: sympy.Symbol, dtype, mx_recipe_name, gpu_name: Optional[str]=None
 ) -> sympy.Symbol:
     # compute bound
-    specs = get_specs()
+    specs = get_specs(gpu_name)
     gemm_ops = 2 * M * K * N
     if dtype is torch.bfloat16:
         peak_tops = specs["bf16_peak_tops"]
@@ -263,6 +264,7 @@ def get_gemm_time_sympy(
     dtype,
     float8_recipe_name: Optional[str],
     mx_recipe_name: Optional[str],
+    gpu_name: Optional[str],
 ):
     # next: add rowwise_with_gw_hp here
     # note: this function is currently not super accurate for small shapes:
@@ -277,13 +279,13 @@ def get_gemm_time_sympy(
         gemm_dtype_grad_weight = torch.bfloat16
 
     gemm_output_time_s = get_individual_gemm_time_sympy(
-        M, K, N, gemm_dtype_input, mx_recipe_name
+        M, K, N, gemm_dtype_input, mx_recipe_name, gpu_name
     )
     gemm_grad_input_time_s = get_individual_gemm_time_sympy(
-        M, N, K, gemm_dtype_grad_input, mx_recipe_name
+        M, N, K, gemm_dtype_grad_input, mx_recipe_name, gpu_name
     )
     gemm_grad_weight_time_s = get_individual_gemm_time_sympy(
-        K, M, N, gemm_dtype_grad_weight, mx_recipe_name
+        K, M, N, gemm_dtype_grad_weight, mx_recipe_name, gpu_name
     )
     total = gemm_output_time_s + gemm_grad_input_time_s + gemm_grad_weight_time_s
     return total
@@ -296,8 +298,9 @@ def get_float8_mem_sympy(
     float8_recipe_name: Optional[str],
     mx_recipe_name: Optional[str],
     enable_fusion_modeling: bool,
+    gpu_name: Optional[str]=None
 ):
-    specs = get_specs()
+    specs = get_specs(gpu_name)
 
     # there are three gemms in the fwd/bwd of a linear:
     #