
Commit adfe570

Disable a flaky test (#257)

Summary: att
Test Plan: python test/integration/test_integration.py
Reviewers:
Subscribers:
Tasks:
Tags:
Co-authored-by: Mark Saroufim <[email protected]>

1 parent e0affd6 · commit adfe570

File tree

2 files changed: +2 −1 lines changed


test/integration/test_integration.py (1 addition, 0 deletions)

@@ -1104,6 +1104,7 @@ def test_weight_only_quant(self):
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skip("This test is flaky, we'll enable later")
     def test_weight_only_quant_force_mixed_mm(self, device, dtype):
         if device != "cuda":
             self.skipTest(f"weight_only_quant_force_mixed_mm can't be constructed on {device}")
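For context, @unittest.skip(reason) from the standard library unconditionally disables a test, while @unittest.skipIf gates it on a condition; the hunk above simply stacks an unconditional skip on top of the existing CUDA check. A minimal self-contained sketch of the same pattern (the class and test names here are illustrative, not from the repository):

    import unittest

    import torch


    class ExampleQuantTests(unittest.TestCase):
        # Illustrative only: mirrors the decorator stacking in the diff above.
        @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
        @unittest.skip("This test is flaky, we'll enable later")
        def test_flaky_mixed_mm_path(self):
            # With @unittest.skip applied, this body never executes; the runner
            # reports the test as skipped with the given reason string.
            self.fail("should not run while the test is disabled")


    if __name__ == "__main__":
        unittest.main()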

test/quantization/test_quant_api.py (1 addition, 1 deletion)

@@ -563,7 +563,7 @@ def get_per_token_block_size(x):
         input_eps = 1e-5
         input_quant_min = -127
         input_quant_max = 127
-        input_quant_func = lambda x: AffineQuantizedTensor.from_float(x, input_mapping_type, get_per_token_block_size(x), input_target_dtype, eps=input_eps, quant_min=input_quant_min, quant_max=input_quant_max, scale_dtype=torch.float)
+        input_quant_func = lambda x: AffineQuantizedTensor.from_float(x, input_mapping_type, get_per_token_block_size(x), input_target_dtype, eps=input_eps, quant_min=input_quant_min, quant_max=input_quant_max, scale_dtype=torch.float32 if x.dtype == torch.float16 else None)

         # use 1024 so that we don't need padding
         m = ToyLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to("cuda")
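The second hunk changes only the scale_dtype argument: scales are kept in float32 when the input is float16, instead of always being forced to torch.float. The sketch below is an assumption about what that argument controls, written in plain PyTorch rather than against AffineQuantizedTensor: it computes a per-token absmax scale for an int8 range and shows that the float16 and float32 scale computations can diverge for small activations, since float16 loses precision near zero.

    import torch

    # Hypothetical illustration (not the torchao implementation): per-token symmetric
    # int8 scales computed with an explicit scale dtype, analogous to scale_dtype above.
    def per_token_scale(x: torch.Tensor, quant_max: int = 127, scale_dtype=None) -> torch.Tensor:
        amax = x.abs().amax(dim=-1, keepdim=True)  # one scale per token (last-dim row)
        if scale_dtype is not None:
            amax = amax.to(scale_dtype)            # widen before dividing
        return amax / quant_max

    x_fp16 = torch.randn(4, 1024, dtype=torch.float16) * 1e-4   # small activations
    scale_fp16 = per_token_scale(x_fp16)                         # stays in float16
    scale_fp32 = per_token_scale(x_fp16, scale_dtype=torch.float32)

    # The float16 result can land in the subnormal range and lose precision,
    # so quantizing with the float32 scale round-trips more accurately.
    print(scale_fp16.dtype, scale_fp32.dtype)
    print((scale_fp16.float() - scale_fp32).abs().max())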
