Skip to content

Commit 1509fd5

Browse files
committed
Fix latency time measurement
1 parent 44ef929 commit 1509fd5

File tree

1 file changed

+2
-15
lines changed

1 file changed

+2
-15
lines changed

benchmarks/bench_load_latency.py

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,7 @@
1111
import triton.language as tl
1212
import iris
1313
from iris._mpi_helpers import mpi_allgather
14-
# from examples.common.utils import read_realtime
15-
16-
@triton.jit
17-
def read_realtime():
18-
tmp = tl.inline_asm_elementwise(
19-
asm="mov.u64 $0, %globaltimer;",
20-
constraints=("=l"),
21-
args=[],
22-
dtype=tl.int64,
23-
is_pure=False,
24-
pack=1,
25-
)
26-
return tmp
14+
from examples.common.utils import read_realtime
2715

2816

2917
@triton.jit()
@@ -270,11 +258,10 @@ def print_run_settings(
270258
mm_begin_cpu = mm_begin_timestamp.cpu().numpy()
271259
mm_end_cpu = mm_end_timestamp.cpu().numpy()
272260

273-
gpu_freq = iris.hip.get_wall_clock_rate(cur_rank) * 1e-3
274261
for destination_rank in range(num_ranks):
275262
delta = mm_end_cpu[destination_rank, :] - mm_begin_cpu[destination_rank, :]
276263
avg_cc = float(delta.sum() / max(1, delta.size) / max(1, niter))
277-
local_latency[destination_rank] = avg_cc / gpu_freq
264+
local_latency[destination_rank] = avg_cc * 10  # the counter value is updated every 10 ns (counter frequency is 100 MHz on MI300), so ticks * 10 gives ns
278265

279266
latency_matrix = mpi_allgather(local_latency.cpu())
280267

0 commit comments

Comments
 (0)