from mpi4py import MPI
# from examples.common.utils import read_realtime

+
@triton.jit
def read_realtime():
    tmp = tl.inline_asm_elementwise(
@@ -23,21 +24,25 @@ def read_realtime():
    )
    return tmp

+
@triton.jit()
def gather_latencies(
-    local_latency,
-    global_latency,
-    curr_rank,
-    num_ranks,
-    BLOCK_SIZE: tl.constexpr,
-    heap_bases: tl.tensor
+    local_latency, global_latency, curr_rank, num_ranks, BLOCK_SIZE: tl.constexpr, heap_bases: tl.tensor
):
    pid = tl.program_id(0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)

    latency_mask = offsets < num_ranks
-    iris.put(local_latency + offsets, global_latency + curr_rank * num_ranks + offsets, curr_rank, 0, heap_bases, mask=latency_mask)
+    iris.put(
+        local_latency + offsets,
+        global_latency + curr_rank * num_ranks + offsets,
+        curr_rank,
+        0,
+        heap_bases,
+        mask=latency_mask,
+    )
+

@triton.jit()
def ping_pong(
@@ -100,9 +105,9 @@ def ping_pong(
# ],
# )

-#def test_load_bench(dtype, heap_size):
+# def test_load_bench(dtype, heap_size):
if __name__ == "__main__":
-    dtype = torch.int32
+    dtype = torch.int32
    heap_size = 1 << 32
    shmem = iris.iris(heap_size)
    num_ranks = shmem.get_num_ranks()
@@ -115,36 +120,42 @@ def ping_pong(
    iter = 1
    skip = 1
    mm_begin_timestamp = torch.zeros((num_ranks, BLOCK_SIZE), dtype=torch.int64, device="cuda")
-    mm_end_timestamp = torch.zeros((num_ranks, BLOCK_SIZE), dtype=torch.int64, device="cuda")
-
-    local_latency = torch.zeros((num_ranks), dtype=torch.float32, device="cuda")
-    latency_matrix = shmem.zeros((num_ranks, num_ranks), dtype=torch.float32,device="cuda")
+    mm_end_timestamp = torch.zeros((num_ranks, BLOCK_SIZE), dtype=torch.int64, device="cuda")

+    local_latency = torch.zeros((num_ranks), dtype=torch.float32, device="cuda")
+    latency_matrix = shmem.zeros((num_ranks, num_ranks), dtype=torch.float32, device="cuda")

    source_buffer = shmem.ones(BUFFER_LEN, dtype=dtype)
    result_buffer = shmem.zeros_like(source_buffer)
-    flag = shmem.ones(1, dtype=dtype)
+    flag = shmem.ones(1, dtype=dtype)

    grid = lambda meta: (1,)
    for source_rank in range(num_ranks):
        for destination_rank in range(num_ranks):
            if source_rank != destination_rank and cur_rank in [source_rank, destination_rank]:
                print(source_rank, destination_rank)
-                ping_pong[grid](source_buffer,
-                    result_buffer, BUFFER_LEN,
-                    skip, iter,
-                    flag,
-                    source_rank, destination_rank,
-                    BLOCK_SIZE,
-                    heap_bases,
-                    mm_begin_timestamp,
-                    mm_end_timestamp)
+                ping_pong[grid](
+                    source_buffer,
+                    result_buffer,
+                    BUFFER_LEN,
+                    skip,
+                    iter,
+                    flag,
+                    source_rank,
+                    destination_rank,
+                    BLOCK_SIZE,
+                    heap_bases,
+                    mm_begin_timestamp,
+                    mm_end_timestamp,
+                )
                shmem.barrier()
                torch.cuda.synchronize()
                MPI.COMM_WORLD.Barrier()

    for destination_rank in range(num_ranks):
-        local_latency[destination_rank] = (mm_end_timestamp.cpu()[destination_rank] - mm_begin_timestamp.cpu()[destination_rank]) / iter
+        local_latency[destination_rank] = (
+            mm_end_timestamp.cpu()[destination_rank] - mm_begin_timestamp.cpu()[destination_rank]
+        ) / iter

    # gather_latencies[grid](local_latency, latency_matrix, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
    # shmem.barrier()
@@ -160,7 +171,6 @@ def ping_pong(
# line = f"R{i}\t" + "\t".join(row_entries) + "\n"
# f.write(line)

-
# if cur_rank == 0:
# print("\nLatency measurements (raw timer ticks and per-iteration average):")
# # for i in range(num_ranks):
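
Usage note (not part of the diff above): the timestamps recorded by read_realtime() are raw GPU wall-clock ticks, so the per-iteration values stored in local_latency only become a time once they are divided by the counter frequency. Below is a minimal post-processing sketch; the 100 MHz value and the ticks_to_us helper are assumptions for illustration, not values taken from this PR, and must be checked against the actual hardware.

# A minimal sketch for post-processing the benchmark output, assuming the
# realtime counter ticks at 100 MHz (hardware-dependent; verify before use).
import torch

WALL_CLOCK_HZ = 100e6  # assumed counter frequency, not taken from this PR


def ticks_to_us(ticks: torch.Tensor) -> torch.Tensor:
    # Convert raw per-iteration tick counts into microseconds.
    return ticks.double() / WALL_CLOCK_HZ * 1e6


# Example: print(ticks_to_us(local_latency.cpu())) after the measurement loop.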