1 | 1 | """
2 | 2 | PyTorch Profiler
3 | 3 | ====================================
| 4 | +**Author:** `Shivam Raikundalia <https://github.com/sraikund16>`_
| 5 | +"""
| 6 | +
| 7 | +######################################################################
| 8 | +"""
4 | 9 | This recipe explains how to use the PyTorch profiler to measure the time and
5 | 10 | memory consumption of a model's operators.
6 | 11 |

12 | 17 | In this recipe, we will use a simple ResNet model to demonstrate how to
13 | 18 | use the profiler to analyze model performance.
14 | 19 |
| 20 | +Prerequisites
| 21 | +---------------
| 22 | +- ``torch >= 1.9``
| 23 | +
15 | 24 | Setup
16 | 25 | -----
17 | 26 | To install ``torch`` and ``torchvision``, use the following command:

20 | 29 |
21 | 30 |    pip install torch torchvision
22 | 31 |
23 | | -
24 | 32 | """
25 | 33 |
26 | | -
27 | 34 | ######################################################################
28 | 35 | # Steps
29 | 36 | # -----

45 | 52 |
46 | 53 | import torch
47 | 54 | import torchvision.models as models
48 | | -from torch.profiler import profile, record_function, ProfilerActivity
| 55 | +from torch.profiler import profile, ProfilerActivity, record_function
49 | 56 |
50 | 57 |
51 | 58 | ######################################################################
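
``record_function``, imported above, is used later in the recipe to label arbitrary
code ranges. A minimal sketch of the pattern, assuming the ``model`` and ``inputs``
defined in the recipe:

    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
        with record_function("model_inference"):
            model(inputs)

The label then appears as its own row in the profiler tables shown below.
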
135 | 142 | # To get a finer granularity of results and include operator input shapes, pass ``group_by_input_shape=True``
136 | 143 | # (note: this requires running the profiler with ``record_shapes=True``):
137 | 144 |
138 | | -print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))
| 145 | +print(
| 146 | +    prof.key_averages(group_by_input_shape=True).table(
| 147 | +        sort_by="cpu_time_total", row_limit=10
| 148 | +    )
| 149 | +)
139 | 150 |
140 | 151 | ########################################################################################
141 | 152 | # The output might look like this (omitting some columns):

167 | 178 | # Users can switch between CPU, CUDA, and XPU
168 | 179 | activities = [ProfilerActivity.CPU]
169 | 180 | if torch.cuda.is_available():
170 | | -    device = 'cuda'
| 181 | +    device = "cuda"
171 | 182 |     activities += [ProfilerActivity.CUDA]
172 | 183 | elif torch.xpu.is_available():
173 | | -    device = 'xpu'
| 184 | +    device = "xpu"
174 | 185 |     activities += [ProfilerActivity.XPU]
175 | 186 | else:
176 | | -    print('Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices')
| 187 | +    print(
| 188 | +        "Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices"
| 189 | +    )
177 | 190 |     import sys
| 191 | +
178 | 192 |     sys.exit(0)
179 | 193 |
180 | 194 | sort_by_keyword = device + "_time_total"
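
Downstream, this keyword feeds the ``sort_by`` argument when the aggregated stats
are printed. A minimal sketch of the usage, assuming a ``prof`` object produced by
a profiling run on the chosen device:

    print(prof.key_averages().table(sort_by=sort_by_keyword, row_limit=10))
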
256 | 270 | model = models.resnet18()
257 | 271 | inputs = torch.randn(5, 3, 224, 224)
258 | 272 |
259 | | -with profile(activities=[ProfilerActivity.CPU],
260 | | -        profile_memory=True, record_shapes=True) as prof:
| 273 | +with profile(
| 274 | +    activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True
| 275 | +) as prof:
261 | 276 |     model(inputs)
262 | 277 |
263 | 278 | print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
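The table can also be sorted by total rather than self memory usage:
``cpu_memory_usage`` includes memory allocated in children operators, so the same
``prof`` object can be re-printed as, for example:

    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))
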
312 | 327 | # Users can switch between CPU, CUDA, and XPU
313 | 328 | activities = [ProfilerActivity.CPU]
314 | 329 | if torch.cuda.is_available():
315 | | -    device = 'cuda'
| 330 | +    device = "cuda"
316 | 331 |     activities += [ProfilerActivity.CUDA]
317 | 332 | elif torch.xpu.is_available():
318 | | -    device = 'xpu'
| 333 | +    device = "xpu"
319 | 334 |     activities += [ProfilerActivity.XPU]
320 | 335 | else:
321 | | -    print('Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices')
| 336 | +    print(
| 337 | +        "Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices"
| 338 | +    )
322 | 339 |     import sys
| 340 | +
323 | 341 |     sys.exit(0)
324 | 342 |
325 | 343 | model = models.resnet18().to(device)

347 | 365 | with profile(
348 | 366 |     activities=activities,
349 | 367 |     with_stack=True,
| 368 | +    experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True),
350 | 369 | ) as prof:
351 | 370 |     model(inputs)
352 | 371 |
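
The stacks recorded by ``with_stack=True`` are surfaced by grouping the averages by
stack; a minimal sketch, assuming the ``sort_by_keyword`` built earlier in the
recipe:

    print(prof.key_averages(group_by_stack_n=5).table(sort_by=sort_by_keyword, row_limit=2))
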
401 | 420 |
402 | 421 | from torch.profiler import schedule
403 | 422 |
404 | | -my_schedule = schedule(
405 | | -    skip_first=10,
406 | | -    wait=5,
407 | | -    warmup=1,
408 | | -    active=3,
409 | | -    repeat=2)
| 423 | +my_schedule = schedule(skip_first=10, wait=5, warmup=1, active=3, repeat=2)
410 | 424 |
411 | 425 | ######################################################################
412 | 426 | # Profiler assumes that the long-running job is composed of steps, numbered
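
Reading ``my_schedule`` above: the profiler skips the first 10 steps, then cycles
through 5 idle (``wait``) steps, 1 ``warmup`` step, and 3 actively recorded steps,
repeating the wait/warmup/active cycle twice. A sketch of a loop driving such a
schedule, where ``train_step`` is a hypothetical stand-in for one iteration:

    with profile(activities=[ProfilerActivity.CPU], schedule=my_schedule) as p:
        for step in range(40):
            train_step()  # hypothetical: one forward/backward iteration
            p.step()  # tell the profiler a step boundary was reached
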
444 | 458 |
445 | 459 | sort_by_keyword = "self_" + device + "_time_total"
446 | 460 |
| 461 | +
447 | 462 | def trace_handler(p):
448 | 463 |     output = p.key_averages().table(sort_by=sort_by_keyword, row_limit=10)
449 | 464 |     print(output)
450 | 465 |     p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")
451 | 466 |
| 467 | +
452 | 468 | with profile(
453 | 469 |     activities=activities,
454 | | -    schedule=torch.profiler.schedule(
455 | | -        wait=1,
456 | | -        warmup=1,
457 | | -        active=2),
458 | | -    on_trace_ready=trace_handler
| 470 | +    schedule=torch.profiler.schedule(wait=1, warmup=1, active=2),
| 471 | +    on_trace_ready=trace_handler,
459 | 472 | ) as p:
460 | 473 |     for idx in range(8):
461 | 474 |         model(inputs)
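
The displayed hunk ends at ``model(inputs)``, but with a schedule attached each
iteration must also advance the profiler for ``on_trace_ready`` to fire. The
complete loop, as a sketch under the same ``with`` block:

    for idx in range(8):
        model(inputs)
        p.step()  # advances the schedule; the handler runs after each active window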