Skip to content

Commit 0d6ef90

Browse files
jeffkbkim authored and facebook-github-bot committed
Fix skipped tests for test_model_parallel_gloo (#3196)
Summary: Pull Request resolved: #3196 The `ModelParallelStateDictTestGloo: test_optimizer_load_state_dict` test is frequently reported as skipped because some of the examples generated by the framework hit the skipTest() condition, which is: - Using CPU with UVM kernel modes (FUSED_UVM, FUSED_UVM_CACHING). While iterating through the generated examples, the test will mark the entire test as "skipped" if any single example hits the skipTest condition. Instead, we should skip only that example (via hypothesis `assume()`) so that hypothesis can generate the next valid example. Reviewed By: jd7-tr Differential Revision: D78355780 fbshipit-source-id: 8ad17b3953e3b1cb2bffcf4e25d6e0537410b66c
1 parent a65363e commit 0d6ef90

File tree

1 file changed

+21
-26
lines changed

1 file changed

+21
-26
lines changed

torchrec/distributed/test_utils/test_model_parallel_base.py

Lines changed: 21 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from fbgemm_gpu.tbe.ssd.utils.partially_materialized_tensor import (
2020
PartiallyMaterializedTensor,
2121
)
22-
from hypothesis import given, settings, strategies as st, Verbosity
22+
from hypothesis import assume, given, settings, strategies as st, Verbosity
2323
from torch import distributed as dist
2424
from torch.distributed._shard.sharded_tensor import ShardedTensor
2525
from torch.distributed._tensor import DTensor
@@ -624,11 +624,10 @@ def test_load_state_dict(
624624
kernel_type: str,
625625
is_training: bool,
626626
) -> None:
627-
if (
628-
self.device == torch.device("cpu")
629-
and kernel_type != EmbeddingComputeKernel.FUSED.value
630-
):
631-
self.skipTest("CPU does not support uvm.")
627+
assume(
628+
self.device != torch.device("cpu")
629+
or kernel_type == EmbeddingComputeKernel.FUSED.value
630+
)
632631

633632
sharders = [
634633
cast(
@@ -683,11 +682,10 @@ def test_optimizer_load_state_dict(
683682
sharding_type: str,
684683
kernel_type: str,
685684
) -> None:
686-
if (
687-
self.device == torch.device("cpu")
688-
and kernel_type != EmbeddingComputeKernel.FUSED.value
689-
):
690-
self.skipTest("CPU does not support uvm.")
685+
assume(
686+
self.device != torch.device("cpu")
687+
or kernel_type == EmbeddingComputeKernel.FUSED.value
688+
)
691689

692690
sharders = [
693691
cast(
@@ -800,11 +798,10 @@ def test_load_state_dict_dp(
800798
def test_load_state_dict_prefix(
801799
self, sharder_type: str, sharding_type: str, kernel_type: str, is_training: bool
802800
) -> None:
803-
if (
804-
self.device == torch.device("cpu")
805-
and kernel_type != EmbeddingComputeKernel.FUSED.value
806-
):
807-
self.skipTest("CPU does not support uvm.")
801+
assume(
802+
self.device != torch.device("cpu")
803+
or kernel_type == EmbeddingComputeKernel.FUSED.value
804+
)
808805

809806
sharders = [
810807
cast(
@@ -855,11 +852,10 @@ def test_load_state_dict_prefix(
855852
def test_params_and_buffers(
856853
self, sharder_type: str, sharding_type: str, kernel_type: str
857854
) -> None:
858-
if (
859-
self.device == torch.device("cpu")
860-
and kernel_type != EmbeddingComputeKernel.FUSED.value
861-
):
862-
self.skipTest("CPU does not support uvm.")
855+
assume(
856+
self.device != torch.device("cpu")
857+
or kernel_type == EmbeddingComputeKernel.FUSED.value
858+
)
863859

864860
sharders = [
865861
create_test_sharder(sharder_type, sharding_type, kernel_type),
@@ -897,11 +893,10 @@ def test_params_and_buffers(
897893
def test_load_state_dict_cw_multiple_shards(
898894
self, sharder_type: str, sharding_type: str, kernel_type: str, is_training: bool
899895
) -> None:
900-
if (
901-
self.device == torch.device("cpu")
902-
and kernel_type != EmbeddingComputeKernel.FUSED.value
903-
):
904-
self.skipTest("CPU does not support uvm.")
896+
assume(
897+
self.device != torch.device("cpu")
898+
or kernel_type == EmbeddingComputeKernel.FUSED.value
899+
)
905900

906901
sharders = [
907902
cast(

0 commit comments

Comments (0)