|  | 
| 4 | 4 |   \frametitle{Profiling} | 
| 5 | 5 |   \begin{block}{Conceptually} | 
| 6 | 6 |     \begin{itemize} | 
| 7 |  | -      \item take a measurement of a performance aspect of a program | 
|  | 7 | +      \item Take a measurement of a performance aspect of a program | 
| 8 | 8 |       \begin{itemize} | 
| 9 |  | -        \item where in my code is most of the time spent? | 
| 10 |  | -        \item is my program compute or memory bound? | 
| 11 |  | -        \item does my program make good use of the cache? | 
| 12 |  | -        \item is my program using all cores most of the time? | 
| 13 |  | -        \item how often are threads blocked and why? | 
| 14 |  | -        \item which API calls are made and in which order? | 
|  | 9 | +        \item Where in my code is most of the time spent? | 
|  | 10 | +        \item Is my program compute or memory bound? | 
|  | 11 | +        \item Does my program make good use of the cache? | 
|  | 12 | +        \item Is my program using all cores most of the time? | 
|  | 13 | +        \item How often are threads blocked and why? | 
|  | 14 | +        \item Which API calls are made and in which order? | 
| 15 | 15 |         \item ... | 
| 16 | 16 |       \end{itemize} | 
| 17 |  | -      \item the goal is to find performance bottlenecks | 
| 18 |  | -      \item is usually done on a compiled program, not on source code | 
|  | 17 | +      \item The goal is to find performance bottlenecks | 
|  | 18 | +      \item Usually done on a compiled program, not on source code | 
| 19 | 19 |     \end{itemize} | 
| 20 | 20 |   \end{block} | 
| 21 | 21 | \end{frame} | 
| 22 | 22 | 
 | 
| 23 | 23 | \begin{frame}[fragile] | 
| 24 |  | -  \frametitle{perf, VTune and uProf} | 
| 25 |  | -  \begin{block}{perf} | 
|  | 24 | +  \frametitle{\mintinline{bash}{perf} -- Performance analysis tools for Linux} | 
|  | 25 | +  \setlength{\leftmargini}{0pt} | 
| 26 | 26 |     \begin{itemize} | 
| 27 |  | -      \item perf is a powerful command line profiling tool for linux | 
| 28 |  | -      \item compile with \mintinline{bash}{-g -fno-omit-frame-pointer} | 
| 29 |  | -      \item \mintinline{bash}{perf stat -d <prg>} gathers performance statistics while running \mintinline{bash}{<prg>} | 
| 30 |  | -      \item \mintinline{bash}{perf record -g <prg>} starts profiling \mintinline{bash}{<prg>} | 
| 31 |  | -      \item \mintinline{bash}{perf report} displays a report from the last profile | 
| 32 |  | -      \item More information in \href{https://perf.wiki.kernel.org/index.php/Main_Page}{this wiki}, \href{https://www.brendangregg.com/linuxperf.html}{this website} or \href{https://indico.cern.ch/event/980497/contributions/4130271/attachments/2161581/3647235/linux-systems-performance.pdf}{this talk}. | 
|  | 27 | +      \item Powerful command line profiling tool for Linux | 
|  | 28 | +      \item Not portable: the source code is part of the Linux kernel itself | 
|  | 29 | +      \item Much lower overhead compared with \mintinline{bash}{valgrind} | 
|  | 30 | +      \item To use it, compile your code with \mintinline{bash}{-g -fno-omit-frame-pointer} | 
|  | 31 | +      \item Counting and sampling | 
|  | 32 | +        \begin{itemize} | 
|  | 33 | +          \item Counting -- count occurrences of a given event (e.g.\ cache misses) | 
|  | 34 | +          \item Time-based sampling -- sample the stack at regular time intervals | 
|  | 35 | +          \item Event-based sampling -- take samples when an event counter overflows | 
|  | 36 | +          \item Instruction-based sampling -- sample instructions and precisely count events they create | 
|  | 37 | +        \end{itemize} | 
|  | 38 | +      \item Static and dynamic tracing | 
|  | 39 | +        \begin{itemize} | 
|  | 40 | +          \item Static -- pre-defined tracepoints in software (e.g.\ scheduling events) | 
|  | 41 | +          \item Dynamic -- tracepoints created dynamically with \mintinline{bash}{perf probe} | 
|  | 42 | +        \end{itemize} | 
| 33 | 43 |     \end{itemize} | 
| 34 |  | -  \end{block} | 
| 35 |  | -  \begin{block}{Intel VTune and AMD uProf} | 
| 36 |  | -    \begin{itemize} | 
| 37 |  | -      \item Graphical profilers from CPU vendors with rich features | 
| 38 |  | -      \item Needs vendor's CPU for full experience | 
| 39 |  | -      \item More information on \href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html}{Intel's website} and \href{https://developer.amd.com/amd-uprof/}{AMD's website} | 
| 40 |  | -    \end{itemize} | 
| 41 |  | -  \end{block} | 
|  | 44 | +\end{frame} | 
|  | 45 | + | 
|  | 46 | +\begin{frame}[fragile] | 
|  | 47 | +  \frametitle{\mintinline{bash}{perf} commands} | 
|  | 48 | +  { \scriptsize | 
|  | 49 | +    \begin{block}{} | 
|  | 50 | +      \begin{minted}{shell-session} | 
|  | 51 | +$ perf | 
|  | 52 | + usage: perf [--version] [--help] [OPTIONS] COMMAND [ARGS] | 
|  | 53 | + The most commonly used perf commands are: | 
|  | 54 | +   annotate        Read perf.data and display annotated code | 
|  | 55 | +   c2c             Shared Data C2C/HITM Analyzer. | 
|  | 56 | +   config          Get and set variables in a configuration file. | 
|  | 57 | +   diff            Read perf.data and display the differential profile | 
|  | 58 | +   evlist          List the event names in a perf.data file | 
|  | 59 | +   list            List all symbolic event types | 
|  | 60 | +   mem             Profile memory accesses | 
|  | 61 | +   record          Run a command and record its profile into perf.data | 
|  | 62 | +   report          Read perf.data and display the profile | 
|  | 63 | +   sched           Tool to trace/measure scheduler properties (latencies) | 
|  | 64 | +   script          Read perf.data and display trace output | 
|  | 65 | +   stat            Run command and gather performance counter statistics | 
|  | 66 | +   top             System profiling tool. | 
|  | 67 | +   version         display the version of perf binary | 
|  | 68 | +   probe           Define new dynamic tracepoints | 
|  | 69 | +   trace           strace inspired tool | 
|  | 70 | + See 'perf help COMMAND' for more information on a specific command. | 
|  | 71 | +      \end{minted} | 
|  | 72 | +    \end{block} | 
|  | 73 | +  } | 
|  | 74 | +\end{frame} | 
|  | 75 | +
 | 
|  | 76 | +\begin{frame}[fragile] | 
|  | 77 | +  \frametitle{Listing events with \mintinline{bash}{perf list}} | 
|  | 78 | +  { \scriptsize | 
|  | 79 | +    \begin{block}{} | 
|  | 80 | +      \begin{minted}{shell-session} | 
|  | 81 | +$ # List main hardware events | 
|  | 82 | +$ perf list hw | 
|  | 83 | +
 | 
|  | 84 | +List of pre-defined events (to be used in -e): | 
|  | 85 | +
 | 
|  | 86 | +  branch-instructions OR branches                    [Hardware event] | 
|  | 87 | +  branch-misses                                      [Hardware event] | 
|  | 88 | +  cache-misses                                       [Hardware event] | 
|  | 89 | +  cache-references                                   [Hardware event] | 
|  | 90 | +  cpu-cycles OR cycles                               [Hardware event] | 
|  | 91 | +  instructions                                       [Hardware event] | 
|  | 92 | +
 | 
|  | 93 | +$ # List main software/cache events | 
|  | 94 | +$ perf list sw | 
|  | 95 | +$ perf list cache | 
|  | 96 | +
 | 
|  | 97 | +$ # List all pre-defined metrics | 
|  | 98 | +$ perf list metric | 
|  | 99 | +
 | 
|  | 100 | +$ # List all currently known events: | 
|  | 101 | +$ perf list | 
|  | 102 | +      \end{minted} | 
|  | 103 | +    \end{block} | 
|  | 104 | +  } | 
|  | 105 | +\end{frame} | 
|  | 106 | +
 | 
|  | 107 | +\begin{frame}[fragile] | 
|  | 108 | +  \frametitle{Counting events with \mintinline{bash}{perf stat}} | 
|  | 109 | +  { \scriptsize | 
|  | 110 | +    \begin{block}{} | 
|  | 111 | +      \begin{minted}{shell-session} | 
|  | 112 | +$ # Standard CPU counter statistics for the specified command: | 
|  | 113 | +$ perf stat <command> | 
|  | 114 | +
 | 
|  | 115 | +$ # Detailed CPU counter statistics for the specified command: | 
|  | 116 | +$ perf stat -d <command> | 
|  | 117 | +$ perf stat -dd <command> | 
|  | 118 | +
 | 
|  | 119 | +$ # Top-down microarchitecture analysis for the entire system, for 10s: | 
|  | 120 | +$ perf stat -a --topdown -- sleep 10 | 
|  | 121 | +
 | 
|  | 122 | +$ # L1 cache hit rate reported every 1000 ms for the specified command: | 
|  | 123 | +$ perf stat -e L1-dcache-loads,L1-dcache-load-misses -I 1000 <command> | 
|  | 124 | +
 | 
|  | 125 | +$ # Instructions per cycle and instruction-level parallelism, for command: | 
|  | 126 | +$ perf stat -M IPC,ILP -- <command> | 
|  | 127 | +
 | 
|  | 128 | +$ # Measure GFLOPs system-wide, until Ctrl-C is used to stop: | 
|  | 129 | +$ perf stat -M GFLOPs | 
|  | 130 | +
 | 
|  | 131 | +$ # Measure cycles and instructions 10 times, report results with stddev: | 
|  | 132 | +$ perf stat -e cycles,instructions -r 10 -- <command> | 
|  | 133 | +      \end{minted} | 
|  | 134 | +    \end{block} | 
|  | 135 | +  } | 
|  | 136 | +\end{frame} | 
|  | 137 | +
 | 
|  | 138 | +
 | 
|  | 139 | +\begin{frame}[fragile] | 
|  | 140 | +  \frametitle{Recording profiling information with \mintinline{bash}{perf record}} | 
|  | 141 | +  { \scriptsize | 
|  | 142 | +    \begin{block}{} | 
|  | 143 | +      \begin{minted}{shell-session} | 
|  | 144 | +$ # Sample on-CPU functions for the specified command, at 100 Hertz: | 
|  | 145 | +$ perf record -F 100 -- <command> | 
|  | 146 | +
 | 
|  | 147 | +$ # Sample CPU stack traces (via frame pointers), at 100 Hertz, for 10s: | 
|  | 148 | +$ perf record -F 100 -g -- sleep 10 | 
|  | 149 | +
 | 
|  | 150 | +$ # Sample stack traces for PID using DWARF to unwind stacks, for 10s: | 
|  | 151 | +$ perf record -p <PID> --call-graph=dwarf -- sleep 10 | 
|  | 152 | +
 | 
|  | 153 | +$ # Precise on-CPU user stack traces (no skid) using PEBS (Intel CPUs): | 
|  | 154 | +$ perf record -g -e cycles:up -- <command> | 
|  | 155 | +
 | 
|  | 156 | +$ # Sample CPU stack traces using Instruction-based sampling (AMD CPUs): | 
|  | 157 | +$ # (Note that you need to use system-wide sampling for IBS on AMD CPUs) | 
|  | 158 | +$ perf record -a -g -e cycles:pp -- <command> | 
|  | 159 | +
 | 
|  | 160 | +$ # Sample CPU stack traces once every 10k L1 data cache misses, for 5s: | 
|  | 161 | +$ perf record -a -g -e L1-dcache-load-misses -c 10000 -- sleep 5 | 
|  | 162 | +
 | 
|  | 163 | +$ # Sample CPUs at 100 Hertz, and show top addresses and symbols, live: | 
|  | 164 | +$ perf top -F 100 | 
|  | 165 | +      \end{minted} | 
|  | 166 | +    \end{block} | 
|  | 167 | +  } | 
|  | 168 | +\end{frame} | 
|  | 169 | +
 | 
|  | 170 | +\begin{frame}[fragile] | 
|  | 171 | +  \frametitle{Reporting and annotating source code with \mintinline{bash}{perf}} | 
|  | 172 | +  { \scriptsize | 
|  | 173 | +    \begin{block}{} | 
|  | 174 | +      \begin{minted}{shell-session} | 
|  | 175 | +$ # Standard reporting of perf.data in the text user interface: | 
|  | 176 | +$ perf report | 
|  | 177 | +
 | 
|  | 178 | +$ # Report by self-time (excluding time spent in callees): | 
|  | 179 | +$ perf report --no-children | 
|  | 180 | +
 | 
|  | 181 | +$ # Report per source line of code (needs debugging info to work): | 
|  | 182 | +$ perf report --no-children -s srcline | 
|  | 183 | +
 | 
|  | 184 | +$ # Single inverted (caller-based) call-graph per binary: | 
|  | 185 | +$ perf report --inverted -s comm | 
|  | 186 | +
 | 
|  | 187 | +$ # Text-based report per library, without call graph: | 
|  | 188 | +$ perf report --stdio -g none -s dso | 
|  | 189 | +
 | 
|  | 190 | +$ # Hierarchical report for functions taking at least 1% of runtime: | 
|  | 191 | +$ perf report --stdio -g none --hierarchy --percent-limit 1 | 
|  | 192 | +
 | 
|  | 193 | +$ # Disassemble and annotate a symbol (instructions with percentages): | 
|  | 194 | +$ # (Needs debugging information available to show source code as well) | 
|  | 195 | +$ perf annotate <symbol> | 
|  | 196 | +      \end{minted} | 
|  | 197 | +    \end{block} | 
|  | 198 | +  } | 
|  | 199 | +\end{frame} | 
|  | 200 | +
 | 
|  | 201 | +\begin{frame}[fragile] | 
|  | 202 | +  \frametitle{Further information on \mintinline{bash}{perf}} | 
|  | 203 | +  \begin{itemize} | 
|  | 204 | +    \item Official documentation in the Linux repository at | 
|  | 205 | +    \href{https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation} | 
|  | 206 | +         {linux/tools/perf/Documentation} | 
|  | 207 | +    \item Perf Wiki at \url{https://perf.wiki.kernel.org/} | 
|  | 208 | +    \item Linux \mintinline{bash}{perf} examples by Brendan Gregg | 
|  | 209 | +          \url{https://www.brendangregg.com/linuxperf.html} | 
|  | 210 | +    \item Scripts to visualize profiles as flamegraphs | 
|  | 211 | +          \url{https://github.com/brendangregg/FlameGraph} | 
|  | 212 | +    \item HSF Tools \& Packaging Working Group talk on Indico\\ | 
|  | 213 | +          \href{https://indico.cern.ch/event/974382/} | 
|  | 214 | +          {Linux Systems Performance: Tracing, Profiling \& Visualization} | 
|  | 215 | +  \end{itemize} | 
|  | 216 | +\end{frame} | 
|  | 217 | +
 | 
|  | 218 | +\begin{frame}[fragile] | 
|  | 219 | +  \frametitle{Intel VTune Profiler} | 
|  | 220 | +  \centering | 
|  | 221 | +  \includegraphics[width=0.75\textwidth]{tools/vtune.png} | 
|  | 222 | +  \begin{itemize} | 
|  | 223 | +    \item Very powerful GUI-based profiler for Intel CPUs and GPUs | 
|  | 224 | +    \item Now free to use with | 
|  | 225 | +      \href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html}{Intel oneAPI Base Toolkit} or | 
|  | 226 | +      \href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html}{standalone} | 
|  | 227 | +    \item See the \href{https://www.intel.com/content/www/us/en/develop/documentation/vtune-help/} | 
|  | 228 | +                       {official online documentation} for more information | 
|  | 229 | +  \end{itemize} | 
| 42 | 230 | \end{frame} | 
0 commit comments