Commit f321317

Update base for Update on "Remove old QAT APIs"
**Summary:** As a follow-up to #2641, which deprecated the old QAT APIs in 0.13.0, we now remove them in the next release, 0.15.0. Fixes #2630.

**Test Plan:** CI

[ghstack-poisoned]
2 parents b0a4f39 + 6c24a7a commit f321317

12 files changed: +982 -27 lines

.github/workflows/doc_build.yml

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install torch
+          python -m pip install setuptools==78.1.1 --force-reinstall
           python -m pip install -e .
           pip install -r dev-requirements.txt
           python -m pip install -r docs/requirements.txt

benchmarks/float8/float8_roofline.py

Lines changed: 7 additions & 1 deletion
@@ -245,8 +245,12 @@ def run(
     bf16_gemm_time_sympy = get_gemm_time_sympy(
         M, K, N, torch.bfloat16, None, None, None
     )
+    lowp_input_dtype = torch.float8_e4m3fn
+    if mx_recipe_name == "mxfp4_cutlass":
+        lowp_input_dtype = torch.float4_e2m1fn_x2
+
     fp8_gemm_time_sympy = get_gemm_time_sympy(
-        M, K, N, torch.float8_e4m3fn, float8_recipe_name, mx_recipe_name, None
+        M, K, N, lowp_input_dtype, float8_recipe_name, mx_recipe_name, None
     )
     print("bf16_gemm_time_sympy", bf16_gemm_time_sympy)
     print("fp8_gemm_time_sympy", fp8_gemm_time_sympy)

@@ -304,6 +308,8 @@ def run(
         rb_fp8_gemm_ratio = -1

     if do_benchmarks:
+        assert mx_recipe_name != "mxfp4_cutlass", "unsupported"
+
         # TODO(future): make the bf16 gemm times exactly match the e2e
         # benchmarks, there is a slight deviation, probably related to gemm
         # operand memory formats/transpositions below not exactly matching

test/dtypes/test_affine_quantized_tensor_parallel.py

Lines changed: 1 addition & 1 deletion
@@ -115,7 +115,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         dn_quant(up_quant(example_input))

         mesh = self.build_device_mesh()
-        mesh.device_type = "cuda"
+        mesh._device_type = "cuda"

         # Shard the models
         up_dist = self.colwise_shard(up_quant, mesh)

test/prototype/mx_formats/test_inference_workflow.py

Lines changed: 27 additions & 1 deletion
@@ -6,10 +6,12 @@

 import copy
 import tempfile
+from contextlib import contextmanager

 import pytest
 import torch
 import torch.nn as nn
+from torch.profiler import ProfilerActivity, profile

 from torchao.prototype.mx_formats.config import (
     MXGemmKernelChoice,

@@ -44,6 +46,23 @@ def run_around_tests():
     torch._dynamo.reset()


+@contextmanager
+def cuda_kernel_profiler(kernel_pattern):
+    """Context manager for profiling CUDA kernels."""
+    result = {"found": False, "kernel_names": []}
+
+    with profile(activities=[ProfilerActivity.CUDA]) as prof:
+        yield result
+
+    kernel_names = [
+        evt.name
+        for evt in prof.events()
+        if evt.device_type == torch.autograd.DeviceType.CUDA and evt.name
+    ]
+    result["kernel_names"] = kernel_names
+    result["found"] = any(kernel_pattern in name for name in kernel_names)
+
+
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="torch.compile requires PyTorch 2.8+"

@@ -178,7 +197,14 @@ def test_inference_workflow_nvfp4(

     x = torch.randn(batch_size, in_features, device="cuda", dtype=inpt_dtype)
     y_ref = m(x)
-    y_mx = m_mx(x)
+
+    if use_triton_kernel and mm_config != NVFP4MMConfig.WEIGHT_ONLY:
+        with cuda_kernel_profiler("quantize_nvfp4_triton_kernel") as result:
+            y_mx = m_mx(x)
+        assert result["found"], "Expected quantize_nvfp4 kernel to be found"
+    else:
+        y_mx = m_mx(x)
+
     sqnr = compute_error(y_ref, y_mx)

     if mm_config == NVFP4MMConfig.WEIGHT_ONLY:
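
Note: the cuda_kernel_profiler helper added above records the CUDA kernels launched inside its body and reports whether any kernel name contains a given substring; that is how the test asserts the Triton NVFP4 quantization kernel actually ran. A minimal standalone sketch of the same pattern (the matmul workload and the local import path are illustrative assumptions, and it needs a CUDA-enabled PyTorch build):

import torch

# Assumes the helper from test/prototype/mx_formats/test_inference_workflow.py
# is importable locally; the workload below is only for illustration.
from test_inference_workflow import cuda_kernel_profiler

a = torch.randn(512, 512, device="cuda", dtype=torch.bfloat16)
b = torch.randn(512, 512, device="cuda", dtype=torch.bfloat16)

# The dict yielded by the context manager is filled in after the block exits.
with cuda_kernel_profiler("gemm") as result:
    _ = a @ b

print(result["found"])         # True if any launched kernel name contains "gemm"
print(result["kernel_names"])  # every CUDA kernel name seen by the profiler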

test/quantization/pt2e/test_quantize_pt2e_qat.py

Lines changed: 0 additions & 6 deletions
@@ -686,12 +686,6 @@ def get_source_fn(node: torch.fx.Node):
         self.assertNotEqual(get_source_fn(second_conv), get_source_fn(second_relu))
         self.assertNotEqual(get_source_fn(first_relu), get_source_fn(second_relu))

-        # Assert that "backbone" exists only in the second set of conv and relu's partition
-        self.assertTrue("backbone" not in get_source_fn(first_conv))
-        self.assertTrue("backbone" not in get_source_fn(first_relu))
-        self.assertTrue("backbone" in get_source_fn(second_conv))
-        self.assertTrue("backbone" in get_source_fn(second_relu))
-
     def test_qat_conv_bn_bias_derived_qspec(self):
         m = self._get_conv_bn_model()
         example_inputs = self.example_inputs

test/test_ops.py

Lines changed: 93 additions & 0 deletions
@@ -40,7 +40,14 @@
 except RuntimeError:
     pytest.skip("torchao.ops not available")

+from torchao.quantization import PerGroup, PerRow, PerTensor
+from torchao.quantization.quant_primitives import (
+    _choose_scale_float8,
+    _dequantize_affine_float8,
+    _quantize_affine_float8,
+)
 from torchao.quantization.utils import (
+    get_block_size,
     get_groupwise_affine_qparams,
     groupwise_affine_dequantize_tensor_from_qparams,
     groupwise_affine_quantize_tensor_from_qparams,

@@ -901,5 +908,91 @@ def _test_scaled_embedding_bag_cpu_helper(
     torch.testing.assert_close(refe_out, test_out, atol=1e-5, rtol=1e-5)


+@pytest.mark.skipif(
+    "CPU" not in torch._C._dispatch_dump("torchao::_scaled_embedding_bag"),
+    reason="cpp kernels not built",
+)
+@pytest.mark.parametrize(
+    "multi_hot, batch_size, vector_size, index_type",
+    EMBEDINGBAG_TEST_PARAMS,
+    ids=str,
+)
+def test_scaled_embedding_bag_int8_cpu(multi_hot, batch_size, vector_size, index_type):
+    _test_scaled_embedding_bag_cpu_helper(
+        multi_hot, batch_size, vector_size, index_type, torch.int8
+    )
+
+
+@pytest.mark.skipif(
+    "CPU" not in torch._C._dispatch_dump("torchao::_scaled_embedding_bag"),
+    reason="cpp kernels not built",
+)
+@pytest.mark.parametrize(
+    "multi_hot, batch_size, vector_size, index_type",
+    EMBEDINGBAG_TEST_PARAMS,
+    ids=str,
+)
+def test_scaled_embedding_bag_fp8_cpu(multi_hot, batch_size, vector_size, index_type):
+    _test_scaled_embedding_bag_cpu_helper(
+        multi_hot, batch_size, vector_size, index_type, torch.float8_e4m3fn
+    )
+
+
+@pytest.mark.skipif(
+    "CPU" not in torch._C._dispatch_dump("torchao::float8_linear_prepack_cpu")
+    or "CPU" not in torch._C._dispatch_dump("torchao::float8_linear_cpu"),
+    reason="cpp kernels not built",
+)
+@pytest.mark.skipif(
+    not torch_version_at_least("2.6.0"), reason="Test only enabled for 2.6+"
+)
+@pytest.mark.parametrize("shape", [(64, 64), (256, 256)])
+@pytest.mark.parametrize("bs", [1, 160])
+@pytest.mark.parametrize("out_dtype", [torch.float, torch.bfloat16, torch.half])
+@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("x_granularity", [PerTensor(), PerRow(), PerGroup(128)])
+@pytest.mark.parametrize("w_granularity", [PerTensor(), PerRow(), PerGroup(128)])
+def test_float8_linear_cpu(shape, bs, out_dtype, bias, x_granularity, w_granularity):
+    in_feature, out_feature = shape
+    if isinstance(x_granularity, PerGroup):
+        if x_granularity.group_size >= in_feature:
+            return
+        if not isinstance(w_granularity, PerGroup):
+            return
+    if isinstance(w_granularity, PerGroup):
+        if w_granularity.group_size >= in_feature:
+            return
+    m = torch.nn.Linear(in_feature, out_feature, bias=bias).eval()
+    b = m.bias
+    x = torch.randn(bs, in_feature)
+    x_block_size = get_block_size(x.shape, x_granularity)
+    x_scale = _choose_scale_float8(
+        x,
+        float8_dtype=torch.float8_e4m3fn,
+        block_size=x_block_size,
+    )
+    x_fp8 = _quantize_affine_float8(x, x_scale, torch.float8_e4m3fn)
+
+    w = m.weight.detach()
+    w_block_size = get_block_size(w.shape, w_granularity)
+    w_scale = _choose_scale_float8(
+        w,
+        float8_dtype=torch.float8_e4m3fn,
+        block_size=w_block_size,
+    )
+    w_fp8 = _quantize_affine_float8(w, w_scale, torch.float8_e4m3fn)
+
+    x_dq = _dequantize_affine_float8(x_fp8, x_scale)
+    w_dq = _dequantize_affine_float8(w_fp8, w_scale)
+    ref = torch.nn.functional.linear(x_dq, w_dq, b).to(out_dtype)
+
+    packed_w, packed_scale = torch.ops.torchao.float8_linear_prepack_cpu(w_fp8, w_scale)
+    y = torch.ops.torchao.float8_linear_cpu(
+        x_fp8, x_scale, packed_w, packed_scale, b, out_dtype
+    )
+
+    torch.testing.assert_close(y, ref, atol=1e-2, rtol=1e-2)
+
+
 if __name__ == "__main__":
     pytest.main(sys.argv)
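
Note: the new test_float8_linear_cpu exercises the CPU float8 linear path end to end: choose a scale for the requested granularity, quantize activation and weight to torch.float8_e4m3fn, prepack the weight, run the fused op, and compare against a dequantize-then-linear reference. A condensed sketch of that flow, reusing only the calls that appear in the diff (per-row granularity and the shapes are illustrative; the torchao C++ CPU kernels must be built):

import torch

from torchao.quantization import PerRow
from torchao.quantization.quant_primitives import (
    _choose_scale_float8,
    _dequantize_affine_float8,
    _quantize_affine_float8,
)
from torchao.quantization.utils import get_block_size

bs, in_feature, out_feature = 4, 64, 64
out_dtype = torch.float32
x = torch.randn(bs, in_feature)
w = torch.randn(out_feature, in_feature)
b = torch.randn(out_feature)

# Quantize activation and weight to float8 with per-row scales.
x_scale = _choose_scale_float8(
    x, float8_dtype=torch.float8_e4m3fn, block_size=get_block_size(x.shape, PerRow())
)
x_fp8 = _quantize_affine_float8(x, x_scale, torch.float8_e4m3fn)
w_scale = _choose_scale_float8(
    w, float8_dtype=torch.float8_e4m3fn, block_size=get_block_size(w.shape, PerRow())
)
w_fp8 = _quantize_affine_float8(w, w_scale, torch.float8_e4m3fn)

# Reference: dequantize, then run a regular linear.
ref = torch.nn.functional.linear(
    _dequantize_affine_float8(x_fp8, x_scale),
    _dequantize_affine_float8(w_fp8, w_scale),
    b,
).to(out_dtype)

# Fused CPU path: prepack the weight, then run the float8 linear op.
packed_w, packed_scale = torch.ops.torchao.float8_linear_prepack_cpu(w_fp8, w_scale)
y = torch.ops.torchao.float8_linear_cpu(
    x_fp8, x_scale, packed_w, packed_scale, b, out_dtype
)

torch.testing.assert_close(y, ref, atol=1e-2, rtol=1e-2)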
