AI-Hypercomputer
diff --git a/‎tests/multimodal_test_utils.py‎
Lines changed: 20 additions & 7 deletions b/‎tests/multimodal_test_utils.py‎
Lines changed: 20 additions & 7 deletions
@@ -4,7 +4,6 @@
 import numpy as np
 import torch
 
-
 def create_random_jax_torch(*shape, dtype=np.float32):
     """Create random array and return both JAX and PyTorch versions.
 
@@ -19,6 +18,26 @@ def create_random_jax_torch(*shape, dtype=np.float32):
     return jnp.array(np_array), torch.from_numpy(np_array)
 
 
+def split_into_patches(x, temporal_patch_size, patch_size):
+    """Split a 5D tensor into patches for PyTorch vision encoder input.
+
+    Converts from full image format (batch, channels, temporal, height, width) to
+    patch format (num_patches, channels, temporal_patch_size, patch_size, patch_size).
+
+    Patches are ordered row-major by (batch, temporal block, patch row, patch column).
+
+    Args:
+        x: Tensor of shape (batch, channels, temporal, height, width).
+        temporal_patch_size: Number of frames per patch; must divide `temporal`.
+        patch_size: Spatial patch edge length; must divide `height` and `width`.
+
+    Returns:
+        Tensor of shape (num_patches, channels, temporal_patch_size, patch_size, patch_size)
+        where num_patches = batch * (temporal//temporal_patch_size) * (height//patch_size) * (width//patch_size)
+
+    Raises:
+        AssertionError: If a dimension is not divisible by its patch size.
+    """
+    B, C, T, H, W = x.shape
+    assert T % temporal_patch_size == 0, f"Temporal dimension {T} must be divisible by {temporal_patch_size}"
+    assert H % patch_size == 0, f"Height {H} must be divisible by {patch_size}"
+    assert W % patch_size == 0, f"Width {W} must be divisible by {patch_size}"
+
+    grid_t = T // temporal_patch_size
+    grid_h = H // patch_size
+    grid_w = W // patch_size
+
+    # Split every patched axis into (grid, within-patch) so the temporal axis is
+    # actually cut into temporal_patch_size-frame blocks instead of kept whole.
+    x = x.reshape(B, C, grid_t, temporal_patch_size, grid_h, patch_size, grid_w, patch_size)
+    # (B, grid_t, grid_h, grid_w, C, temporal_patch_size, patch_size, patch_size)
+    x = x.permute(0, 2, 4, 6, 1, 3, 5, 7)
+    # Explicit patch count (rather than -1) keeps the reshape valid for empty batches.
+    return x.reshape(B * grid_t * grid_h * grid_w, C, temporal_patch_size, patch_size, patch_size)
+
+
 def assert_all_close_jax_torch(jax_tensor, torch_tensor, rtol, atol, error_msg=""):
     """Compare JAX and PyTorch tensors for numerical closeness.
 
@@ -295,8 +314,6 @@ def copy_maxtext_encoder_weights(torch_encoder, maxtext_encoder):
 
 
 # Vision-specific weight copying utilities
-
-
 def copy_conv3d_weights(torch_conv, jax_conv):
     """Copy weights from PyTorch Conv3d to JAX nnx.Conv (3D)."""
     # PyTorch Conv3d: (out_channels, in_channels, kD, kH, kW)
@@ -332,8 +349,6 @@ def copy_vision_encoder_weights(torch_encoder, jax_encoder):
         torch_encoder: PyTorch Qwen3OmniMoeVisionEncoder
         jax_encoder: JAX Qwen3OmniMoeVisionEncoder
     """
-    import jax.numpy as jnp
-
     # Copy patch embedding
     copy_patch_embed_weights(torch_encoder.patch_embed, jax_encoder.patch_embed)
 
@@ -362,8 +377,6 @@ def copy_vision_encoder_weights(torch_encoder, jax_encoder):
 
 
 # Audio-specific utilities
-
-
 def create_block_diagonal_attention_mask(
     cu_seqlens, dtype
 ):