1 | 1 | """
2 | 2 | PyTorch Profiler
3 | 3 | ====================================
| 4 | +**Author:** `Shivam Raikundalia <https://github.com/sraikund16>`_
| 5 | +"""
| 6 | +
| 7 | +######################################################################
| 8 | +"""
4 | 9 | This recipe explains how to use the PyTorch profiler to measure the time and
5 | 10 | memory consumption of a model's operators.
6 | 11 |

12 | 17 | In this recipe, we will use a simple ResNet model to demonstrate how to
13 | 18 | use the profiler to analyze model performance.
14 | 19 |
| 20 | +Prerequisites
| 21 | +---------------
| 22 | +- ``torch >= 1.9``
| 23 | +
15 | 24 | Setup
16 | 25 | -----
17 | 26 | To install ``torch`` and ``torchvision``, use the following command:

20 | 29 |
21 | 30 |    pip install torch torchvision
22 | 31 |
23 | | -
24 | 32 | """
25 | 33 |
26 | | -
27 | 34 | ######################################################################
28 | 35 | # Steps
29 | 36 | # -----

45 | 52 |
46 | 53 | import torch
47 | 54 | import torchvision.models as models
48 | | -from torch.profiler import profile, record_function, ProfilerActivity
| 55 | +from torch.profiler import profile, ProfilerActivity, record_function
49 | 56 |
50 | 57 |
51 | 58 | ######################################################################
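
``record_function``, imported above, is used later in the recipe to label arbitrary
code ranges. A minimal sketch of the pattern, assuming the ``model`` and ``inputs``
defined in the recipe:

    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
        with record_function("model_inference"):
            model(inputs)

The label then appears as its own row in the profiler tables shown below.
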
135 | 142 | # To get a finer granularity of results and include operator input shapes, pass ``group_by_input_shape=True``
136 | 143 | # (note: this requires running the profiler with ``record_shapes=True``):
137 | 144 |
138 | | -print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))
| 145 | +print(
| 146 | +    prof.key_averages(group_by_input_shape=True).table(
| 147 | +        sort_by="cpu_time_total", row_limit=10
| 148 | +    )
| 149 | +)
139 | 150 |
140 | 151 | ########################################################################################
141 | 152 | # The output might look like this (omitting some columns):

167 | 178 | # Users can switch between CPU, CUDA, and XPU
168 | 179 | activities = [ProfilerActivity.CPU]
169 | 180 | if torch.cuda.is_available():
170 | | -    device = 'cuda'
| 181 | +    device = "cuda"
171 | 182 |     activities += [ProfilerActivity.CUDA]
172 | 183 | elif torch.xpu.is_available():
173 | | -    device = 'xpu'
| 184 | +    device = "xpu"
174 | 185 |     activities += [ProfilerActivity.XPU]
175 | 186 | else:
176 | | -    print('Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices')
| 187 | +    print(
| 188 | +        "Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices"
| 189 | +    )
177 | 190 |     import sys
| 191 | +
178 | 192 |     sys.exit(0)
179 | 193 |
180 | 194 | sort_by_keyword = device + "_time_total"
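
Downstream, this keyword feeds the ``sort_by`` argument when the aggregated stats
are printed. A minimal sketch of the usage, assuming a ``prof`` object produced by
a profiling run on the chosen device:

    print(prof.key_averages().table(sort_by=sort_by_keyword, row_limit=10))
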
256 | 270 | model = models.resnet18()
257 | 271 | inputs = torch.randn(5, 3, 224, 224)
258 | 272 |
259 | | -with profile(activities=[ProfilerActivity.CPU],
260 | | -        profile_memory=True, record_shapes=True) as prof:
| 273 | +with profile(
| 274 | +    activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True
| 275 | +) as prof:
261 | 276 |     model(inputs)
262 | 277 |
263 | 278 | print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
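The table can also be sorted by total rather than self memory usage:
``cpu_memory_usage`` includes memory allocated in children operators, so the same
``prof`` object can be re-printed as, for example:

    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))
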
312 | 327 | # Users can switch between CPU, CUDA, and XPU
313 | 328 | activities = [ProfilerActivity.CPU]
314 | 329 | if torch.cuda.is_available():
315 | | -    device = 'cuda'
| 330 | +    device = "cuda"
316 | 331 |     activities += [ProfilerActivity.CUDA]
317 | 332 | elif torch.xpu.is_available():
318 | | -    device = 'xpu'
| 333 | +    device = "xpu"
319 | 334 |     activities += [ProfilerActivity.XPU]
320 | 335 | else:
321 | | -    print('Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices')
| 336 | +    print(
| 337 | +        "Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices"
| 338 | +    )
322 | 339 |     import sys
| 340 | +
323 | 341 |     sys.exit(0)
324 | 342 |
325 | 343 | model = models.resnet18().to(device)

347 | 365 | with profile(
348 | 366 |     activities=activities,
349 | 367 |     with_stack=True,
| 368 | +    experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True),
350 | 369 | ) as prof:
351 | 370 |     model(inputs)
352 | 371 |
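
The stacks recorded by ``with_stack=True`` are surfaced by grouping the averages by
stack; a minimal sketch, assuming the ``sort_by_keyword`` built earlier in the
recipe:

    print(prof.key_averages(group_by_stack_n=5).table(sort_by=sort_by_keyword, row_limit=2))
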
401 | 420 |
402 | 421 | from torch.profiler import schedule
403 | 422 |
404 | | -my_schedule = schedule(
405 | | -    skip_first=10,
406 | | -    wait=5,
407 | | -    warmup=1,
408 | | -    active=3,
409 | | -    repeat=2)
| 423 | +my_schedule = schedule(skip_first=10, wait=5, warmup=1, active=3, repeat=2)
410 | 424 |
411 | 425 | ######################################################################
412 | 426 | # Profiler assumes that the long-running job is composed of steps, numbered
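
Reading ``my_schedule`` above: the profiler skips the first 10 steps, then cycles
through 5 idle (``wait``) steps, 1 ``warmup`` step, and 3 actively recorded steps,
repeating the wait/warmup/active cycle twice. A sketch of a loop driving such a
schedule, where ``train_step`` is a hypothetical stand-in for one iteration:

    with profile(activities=[ProfilerActivity.CPU], schedule=my_schedule) as p:
        for step in range(40):
            train_step()  # hypothetical: one forward/backward iteration
            p.step()  # tell the profiler a step boundary was reached
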
444 | 458 |
445 | 459 | sort_by_keyword = "self_" + device + "_time_total"
446 | 460 |
| 461 | +
447 | 462 | def trace_handler(p):
448 | 463 |     output = p.key_averages().table(sort_by=sort_by_keyword, row_limit=10)
449 | 464 |     print(output)
450 | 465 |     p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")
451 | 466 |
| 467 | +
452 | 468 | with profile(
453 | 469 |     activities=activities,
454 | | -    schedule=torch.profiler.schedule(
455 | | -        wait=1,
456 | | -        warmup=1,
457 | | -        active=2),
458 | | -    on_trace_ready=trace_handler
| 470 | +    schedule=torch.profiler.schedule(wait=1, warmup=1, active=2),
| 471 | +    on_trace_ready=trace_handler,
459 | 472 | ) as p:
460 | 473 |     for idx in range(8):
461 | 474 |         model(inputs)
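
The displayed hunk ends at ``model(inputs)``, but with a schedule attached each
iteration must also advance the profiler for ``on_trace_ready`` to fire. The
complete loop, as a sketch under the same ``with`` block:

    for idx in range(8):
        model(inputs)
        p.step()  # advances the schedule; the handler runs after each active window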