Fix latency time measurement

astroC86 · astroC86 · commit 44ef929b2748 · 2025-09-06T20:04:01.000+02:00
diff --git a/benchmarks/bench_load_latency.py b/benchmarks/bench_load_latency.py
@@ -11,7 +11,19 @@
 import triton.language as tl
 import iris
 from iris._mpi_helpers import mpi_allgather
-from examples.common.utils import read_realtime
+# from examples.common.utils import read_realtime
+
+@triton.jit
+def read_realtime():  # Device-side helper: returns one raw timestamp value read through inline assembly.
+    tmp = tl.inline_asm_elementwise(
+        asm="mov.u64 $0, %globaltimer;",  # NOTE(review): PTX-style read of %globaltimer, yet the host code scales the result by iris.hip.get_wall_clock_rate -- confirm target and units agree
+        constraints=("=l"),  # parentheses only: this is the plain string "=l", not a tuple
+        args=[],  # the asm takes no input operands
+        dtype=tl.int64,  # the value is returned as a 64-bit integer
+        is_pure=False,  # declared impure; presumably so repeated reads are not merged or hoisted -- TODO confirm
+        pack=1,
+    )
+    return tmp
 
 
 @triton.jit()
@@ -38,10 +50,10 @@ def load_remote(
         if i == skip:
             start = read_realtime()
             tl.store(mm_begin_timestamp_ptr + peer_rank * BLOCK_SIZE + offsets, start, time_stmp_mask)
-        
+
         # iris.load(data + offsets, curr_rank, peer_rank,heap_bases, data_mask)
         from_base = tl.load(heap_bases + curr_rank)
-        to_base   = tl.load(heap_bases + peer_rank)
+        to_base = tl.load(heap_bases + peer_rank)
         offset = tl.cast(data + offsets, tl.uint64) - from_base
         translated_ptr = tl.cast(tl.cast(to_base, tl.pointer_type(tl.int8)) + offset, (data + offsets).dtype)
         result = tl.load(translated_ptr, mask=data_mask, cache_modifier=".cv", volatile=True)
@@ -240,15 +252,14 @@ def print_run_settings(
     grid = lambda meta: (1,)
     for source_rank in range(num_ranks):
         for destination_rank in range(num_ranks):
-            if cur_rank in [source_rank, destination_rank]:
-                peer_for_me = destination_rank if cur_rank == source_rank else source_rank
+            if cur_rank == source_rank:
                 load_remote[grid](
                     source_buffer,
                     BUFFER_LEN,
                     skip,
                     niter,
                     cur_rank,
-                    peer_for_me,
+                    destination_rank,
                     BLOCK_SIZE,
                     heap_bases,
                     mm_begin_timestamp,
@@ -258,10 +269,12 @@ def print_run_settings(
 
     mm_begin_cpu = mm_begin_timestamp.cpu().numpy()
     mm_end_cpu = mm_end_timestamp.cpu().numpy()
+
+    gpu_freq  = iris.hip.get_wall_clock_rate(cur_rank) * 1e-3
     for destination_rank in range(num_ranks):
         delta = mm_end_cpu[destination_rank, :] - mm_begin_cpu[destination_rank, :]
-        avg_ns = float(delta.sum() / max(1, delta.size) / max(1, niter))
-        local_latency[destination_rank] = avg_ns
+        avg_cc = float(delta.sum() / max(1, delta.size) / max(1, niter))
+        local_latency[destination_rank] = avg_cc / gpu_freq
 
     latency_matrix = mpi_allgather(local_latency.cpu())