
Commit e4dfaf4

Committed Apr 1, 2025
Add 08-tbb slides
1 parent 927aac5 commit e4dfaf4

File tree

3 files changed: +518 −0 lines changed

08-tbb/08-tbb.tex

Lines changed: 511 additions & 0 deletions
\documentclass{beamer}

% Theme choice
\usetheme{Madrid}

% Optional packages
\usepackage{graphicx} % For including images
\usepackage{amsmath} % For math symbols and formulas
\usepackage{hyperref} % For hyperlinks
\usepackage{tikz} % For charts
\usepackage{listings}
\usepackage{xcolor}
\usepackage[T1]{fontenc}

\lstdefinestyle{CStyle}{
    language=C, % Set the language to C
    basicstyle=\ttfamily\tiny\linespread{0.9}, % Set font style and size
    keywordstyle=\color{blue}, % Color of keywords
    commentstyle=\color{gray}, % Color of comments
    stringstyle=\color{red}, % Color of strings
    showstringspaces=false, % Do not mark spaces in strings
    breaklines=true, % Enable line breaks at appropriate places
    breakatwhitespace=false, % Break lines at any character, not just whitespace
    numbers=left, % Show line numbers on the left
    numberstyle=\tiny\color{gray}, % Style for line numbers
    tabsize=4, % Set tab width
    keepspaces=true, % Keep indentation spaces
    frame=single, % Add a border around the code
    aboveskip=0pt, % Reduce space above the code block
    belowskip=0pt, % Reduce space below the code block
    xleftmargin=7.5pt, % Add left padding (approx. 2.8mm)
    xrightmargin=15pt, % Add right padding
}

% Title, author, date, and institute (optional)
\title[Parallel Programming. TBB]{Parallel Programming course. TBB}
\author{Obolenskiy Arseniy, Nesterov Alexander}
\institute{Nizhny Novgorod State University}

\date{\today} % or \date{Month Day, Year}

% Redefine the footline to display both the short title and the university name
\setbeamertemplate{footline}{
  \leavevmode%
  \hbox{%
  \begin{beamercolorbox}[wd=.45\paperwidth,ht=2.5ex,dp=1ex,leftskip=1em,center]{author in head/foot}%
    \usebeamerfont{author in head/foot}\insertshortinstitute % Displays the university name
  \end{beamercolorbox}%
  \begin{beamercolorbox}[wd=.45\paperwidth,ht=2.5ex,dp=1ex,leftskip=1em,center]{author in head/foot}%
    \usebeamerfont{author in head/foot}\insertshorttitle % Displays the short title
  \end{beamercolorbox}%
  \begin{beamercolorbox}[wd=.1\paperwidth,ht=2.5ex,dp=1ex,rightskip=1em,center]{author in head/foot}%
    \usebeamerfont{author in head/foot}\insertframenumber{} / \inserttotalframenumber
  \end{beamercolorbox}}%
  \vskip0pt%
}

\begin{document}

% Title slide
\begin{frame}
  \titlepage
\end{frame}

% Table of Contents (optional)
\begin{frame}{Contents}
  \tableofcontents
\end{frame}

\section{Introduction to TBB}
\begin{frame}{OpenMP recap}
  \begin{itemize}
    \item OpenMP is an open standard for shared-memory parallel programming
    \item It uses compiler directives (\texttt{\#pragma omp}), runtime functions, and environment variables
    \item Simplifies parallel loops and regions for multi-threaded execution
    \item Serves as a baseline for comparing other parallel models (a short refresher example follows)
  \end{itemize}
\end{frame}

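\begin{frame}[fragile]{OpenMP recap: example}
  A minimal refresher sketch of an OpenMP parallel loop, for later comparison with \texttt{tbb::parallel\_for} (\texttt{compute} is a hypothetical helper):
  \vspace{0.5em}
  \lstset{style=CStyle}
  \begin{lstlisting}
#include <vector>

std::vector<int> data(N);

// Iterations are split among threads by the compiler/runtime
#pragma omp parallel for
for (int i = 0; i < N; ++i) {
    data[i] = compute(i); // hypothetical helper
}
  \end{lstlisting}
\end{frame}
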
\begin{frame}{OpenMP vs TBB}
  \begin{itemize}
    \item OpenMP:
    \begin{itemize}
      \item Open standard (not a library) that defines a set of compiler directives, runtime library routines, and environment variables for parallel programming
      \item Directive-based, integrated into the compiler (the specific implementation is done on the compiler side)
      \item Implements thread-level parallelism
    \end{itemize}
    \item TBB (Threading Building Blocks):
    \begin{itemize}
      \item C++ template library
      \item Uses task-based parallelism with work-stealing scheduling
      \item Provides higher-level constructs and generic parallel patterns
    \end{itemize}
    \item TBB enables more flexible and fine-grained parallelism compared to OpenMP
  \end{itemize}
\end{frame}

\begin{frame}{Pros and Cons of TBB}
  \begin{columns}
    \column{0.5\textwidth}
    Pros
    \begin{itemize}
      \item Task-based parallelism with dynamic work-stealing
      \item High-level constructs simplify parallel code
      \item Seamless integration with modern C++ and STL
      \item Scalable and efficient for fine-grained tasks
      \item Portable across different platforms
    \end{itemize}
    \column{0.5\textwidth}
    Cons
    \begin{itemize}
      \item Steeper learning curve compared to directive-based models
      \item Less intuitive for simple loop parallelism
      \item Debugging threading issues can be challenging due to dynamic scheduling
    \end{itemize}
  \end{columns}
\end{frame}

\begin{frame}{TBB history}
  \begin{itemize}
    \item Originally developed by Intel to simplify parallel programming in C++
    \item Evolved into an open-source library and later integrated into Intel oneAPI
    \item Widely adopted in industry and academia for scalable task-based parallelism
  \end{itemize}
\end{frame}

\begin{frame}{TBB history timeline}
  \begin{itemize}
    \item Early 2000s: Conceptual groundwork for task-based parallelism laid at Intel
    \item 2006: Initial development of TBB begins for internal projects
    \item 2007: First public release of Intel Threading Building Blocks
    \item 2010: Major updates introduce improved C++ integration and performance enhancements
    \item 2017: TBB relicensed under Apache 2.0, fostering community contributions
    \item 2019: Integration into Intel oneAPI, expanding its cross-platform reach
  \end{itemize}
\end{frame}

\begin{frame}{TBB fundamentals}
  \begin{itemize}
    \item A C++ library for task-based parallelism
    \item Abstracts low-level thread management and uses a work-stealing scheduler
    \item Provides high-level constructs (e.g., \texttt{tbb::parallel\_for}, \texttt{tbb::parallel\_reduce}) that simplify parallel code implementation
    \item Promotes writing scalable code by focusing on tasks rather than threads
  \end{itemize}
\end{frame}

\section{TBB task scheduler}

\begin{frame}{Work-stealing dynamic scheduler}
  A work-stealing dynamic scheduler in TBB is a scheduling algorithm designed to balance the workload efficiently across multiple threads in a parallel program.

  The idea: each thread maintains its own queue of tasks. When a thread finishes its own work and becomes idle, instead of waiting, it tries to "steal" tasks from other threads' queues to stay productive. This dynamically balances the workload and utilizes CPU resources effectively.
\end{frame}

\begin{frame}{How TBB Implements Work Stealing}
  \begin{itemize}
    \item Local Queues:
    \begin{itemize}
      \item Each worker thread maintains its own local task queue (a double-ended queue)
      \item New tasks are added to the bottom (LIFO) for cache-friendly, nested task execution
    \end{itemize}
    \item Stealing:
    \begin{itemize}
      \item When a thread's local queue is empty, it becomes a "thief"
      \item The thief randomly selects a victim thread and steals a task from the top (FIFO) of its deque
    \end{itemize}
    \item Minimizing Contention:
    \begin{itemize}
      \item Most operations are thread-local, avoiding the need for synchronization
      \item Stealing operations are synchronized but occur infrequently and in a randomized manner
    \end{itemize}
  \end{itemize}
  \vspace{0.5em}
  An unbalanced-loop sketch on the next slide shows stealing in action.
\end{frame}

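\begin{frame}[fragile]{Work stealing in action (sketch)}
  A minimal sketch of an unbalanced loop: iteration cost grows with \texttt{i}, so the default partitioner lets idle threads steal subranges from busy ones (\texttt{heavy\_compute} is a hypothetical helper):
  \vspace{0.5em}
  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"

// Iteration cost grows with i, so early subranges finish first
// and their threads steal work from later, heavier subranges
tbb::parallel_for(tbb::blocked_range<size_t>(0, N),
    [&](const tbb::blocked_range<size_t>& r) {
        for (size_t i = r.begin(); i != r.end(); ++i) {
            heavy_compute(i); // hypothetical: cost proportional to i
        }
    });
  \end{lstlisting}
\end{frame}
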
\section{TBB constructs and patterns}

\begin{frame}{TBB Core constructs and patterns}
  \begin{itemize}
    \item \texttt{tbb::parallel\_for}: Distributes loop iterations among tasks
    \item \texttt{tbb::parallel\_reduce}: Performs reductions over a range
    \item \texttt{tbb::parallel\_scan}: Computes prefix sums in parallel
    \item \texttt{tbb::parallel\_invoke}: Executes independent functions concurrently as tasks
    \item \texttt{tbb::task\_group}: For more convenient parallel task management
    \item Concurrent containers and synchronization primitives
  \end{itemize}
\end{frame}

\begin{frame}[fragile]{tbb::blocked\_range}
  \begin{itemize}
    \item Purpose: Defines a range of values to be processed in parallel
    \item Usage: Commonly used with \texttt{tbb::parallel\_for} and \texttt{tbb::parallel\_reduce}
    \item Range Specification: Represents a half-open interval \([begin, end)\) and supports an optional grain size
    \item Grain Size:
    \begin{itemize}
      \item Specifies the minimum number of iterations in a subrange
      \item A small grain size creates finer-grained tasks, improving load balancing but adding scheduling overhead
      \item A large grain size reduces overhead but may cause imbalance if work per iteration varies
    \end{itemize}
  \end{itemize}

  \vspace{0.5em}

  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/blocked_range.h"

// Example: process a range of indices [0, N)
tbb::blocked_range<size_t> range(0, N, grain_size);

tbb::parallel_for(range, [&](const tbb::blocked_range<size_t>& r) {
    for (size_t i = r.begin(); i != r.end(); ++i) {
        process(i);
    }
});
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]{Parallel loops (\texttt{tbb::parallel\_for})}
  \begin{itemize}
    \item Splits a loop range into subranges executed in parallel
    \item Uses a blocked range to define the iteration space
  \end{itemize}
  \vspace{0.5em}
  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
#include <vector>

std::vector<int> data(N);

tbb::parallel_for(tbb::blocked_range<size_t>(0, N),
    [&](const tbb::blocked_range<size_t>& r) {
        for (size_t i = r.begin(); i != r.end(); ++i) {
            data[i] = compute(i);
        }
    });
  \end{lstlisting}

  Or with the simpler index-based overload:

  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/parallel_for.h"
#include <vector>

std::vector<int> data(N);

// Simple form: iterate an index range directly, no blocked_range needed
tbb::parallel_for(size_t(0), size_t(N),
    [&](size_t i) {
        data[i] = compute(i);
    });
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]{Static scheduling with \texttt{static\_partitioner}}
  \begin{itemize}
    \item TBB uses dynamic work-stealing by default
    \item Use \texttt{tbb::static\_partitioner} to enforce a static division of the iteration space
    \item This partitioner divides work evenly at the start, which can reduce overhead for regular, balanced workloads
  \end{itemize}

  \vspace{0.5em}

  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
#include "tbb/partitioner.h" // defines tbb::static_partitioner
#include <vector>

std::vector<int> data(N);

tbb::parallel_for(
    tbb::blocked_range<size_t>(0, N),
    [&](const tbb::blocked_range<size_t>& r) {
        for (size_t i = r.begin(); i != r.end(); ++i) {
            data[i] = compute(i);
        }
    },
    tbb::static_partitioner() // Use static partitioning
);
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]{Parallel reduction (\texttt{tbb::parallel\_reduce})}
  \begin{itemize}
    \item Performs a reduction (e.g., sum, max) across a range
    \item Divides the work and then combines partial results
  \end{itemize}
  \vspace{0.5em}
  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/parallel_reduce.h"
#include "tbb/blocked_range.h"
#include <vector>
#include <functional>

std::vector<int> array(N);

int total = tbb::parallel_reduce(
    tbb::blocked_range<size_t>(0, N),
    0,
    [&](const tbb::blocked_range<size_t>& r, int local_sum) -> int {
        for (size_t i = r.begin(); i != r.end(); ++i)
            local_sum += array[i];
        return local_sum;
    },
    std::plus<int>());
  \end{lstlisting}

  {\tiny
  Supported operations:
  \begin{itemize}
    \item Sum: \texttt{a + b}
    \item Product: \texttt{a * b}
    \item Minimum: \texttt{std::min(a, b)}
    \item Maximum: \texttt{std::max(a, b)}
    \item Logical AND / OR: \texttt{a \&\& b} and \texttt{a || b}
    \item Custom operations: e.g., merging data structures or combining histograms (see the sketch on the next slide)
  \end{itemize}
  }
\end{frame}

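\begin{frame}[fragile]{Parallel reduction: custom operation (sketch)}
  A minimal sketch of a non-sum reduction: computing the maximum, with \texttt{std::max} as the combine step and the smallest \texttt{int} as the identity value:
  \vspace{0.5em}
  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/parallel_reduce.h"
#include "tbb/blocked_range.h"
#include <vector>
#include <algorithm>
#include <limits>

std::vector<int> array(N);

int maximum = tbb::parallel_reduce(
    tbb::blocked_range<size_t>(0, N),
    std::numeric_limits<int>::min(), // identity for max
    [&](const tbb::blocked_range<size_t>& r, int local_max) -> int {
        for (size_t i = r.begin(); i != r.end(); ++i)
            local_max = std::max(local_max, array[i]);
        return local_max;
    },
    // Combine partial results from different subranges
    [](int a, int b) { return std::max(a, b); });
  \end{lstlisting}
\end{frame}
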
\begin{frame}[fragile]{TBB tasks}
  \begin{itemize}
    \item TBB tasks represent independent units of work
    \item They allow fine-grained parallelism and dynamic scheduling
  \end{itemize}
  \vspace{0.5em}
  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/parallel_invoke.h"

tbb::parallel_invoke(
    [](){ compute_task_A(); },
    [](){ compute_task_B(); },
    [](){ compute_task_C(); }
);
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]{TBB task groups}
  \begin{itemize}
    \item The \texttt{task\_group} interface simplifies task management
    \item It allows launching a set of tasks and waiting for all of them
  \end{itemize}
  \vspace{0.5em}
  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/task_group.h"

void compute() {
    tbb::task_group tg;
    tg.run([](){ compute_task_A(); });
    tg.run([](){ compute_task_B(); });
    tg.wait(); // Wait for both tasks to complete
}
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]{Scan algorithm (\texttt{tbb::parallel\_scan})}
  \begin{itemize}
    \item Useful for parallel prefix sum (scan) operations
    \item Supports both inclusive and exclusive scans
  \end{itemize}
  \vspace{0.5em}
  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/parallel_scan.h"
#include "tbb/blocked_range.h"
#include <vector>
#include <numeric>

std::vector<int> data(N);
int initial = 0;

tbb::parallel_scan(
    tbb::blocked_range<size_t>(0, N),
    initial,
    [&](const tbb::blocked_range<size_t>& r, int running_total, bool is_final_scan) -> int {
        for (size_t i = r.begin(); i != r.end(); ++i) {
            running_total += data[i];
            if (is_final_scan)
                data[i] = running_total;
        }
        return running_total;
    },
    std::plus<int>());
  \end{lstlisting}
\end{frame}

\section{Synchronization}

\begin{frame}{Synchronization}
  \begin{itemize}
    \item TBB provides synchronization primitives such as:
    \begin{itemize}
      \item \texttt{tbb::mutex} and \texttt{tbb::spin\_mutex} for mutual exclusion
      \item Atomic operations and concurrent containers for lock-free data access (see the sketch on the next slide)
    \end{itemize}
    \item The runtime work-stealing scheduler minimizes contention by dynamically balancing tasks
  \end{itemize}
\end{frame}

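\begin{frame}[fragile]{Concurrent containers: example (sketch)}
  A minimal sketch using \texttt{tbb::concurrent\_vector}: many tasks append results concurrently without an explicit mutex (\texttt{compute} is a hypothetical helper):
  \vspace{0.5em}
  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
#include "tbb/concurrent_vector.h"

tbb::concurrent_vector<int> results;

tbb::parallel_for(tbb::blocked_range<size_t>(0, N),
    [&](const tbb::blocked_range<size_t>& r) {
        for (size_t i = r.begin(); i != r.end(); ++i) {
            // push_back is thread-safe for concurrent_vector
            results.push_back(compute(i)); // hypothetical helper
        }
    });
  \end{lstlisting}
\end{frame}
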
\begin{frame}[fragile]{Mutex (\texttt{tbb::mutex})}
  \begin{itemize}
    \item Provides mutual exclusion for protecting shared data
    \item Uses RAII (Resource Acquisition Is Initialization) with \texttt{scoped\_lock} to automatically manage locking
    \item Lightweight and efficient for fine-grained synchronization
  \end{itemize}

  \vspace{0.5em}

  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/mutex.h"
#include <vector>

tbb::mutex m;
std::vector<int> shared_data;

void update_data(int value) {
    tbb::mutex::scoped_lock lock(m); // Lock acquired
    shared_data.push_back(value);
    // Lock released automatically when 'lock' goes out of scope
}
  \end{lstlisting}
\end{frame}

\begin{frame}{What about the barrier in TBB?}
  \begin{itemize}
    \item Unlike OpenMP, TBB does not provide an explicit barrier construct
    \item Implicit Synchronization:
    \begin{itemize}
      \item TBB parallel algorithms (e.g., \texttt{tbb::parallel\_for}, \texttt{tbb::parallel\_reduce}) return only after all tasks are complete
      \item This behavior naturally synchronizes work without needing an explicit barrier (see the sketch on the next slide)
    \end{itemize}
    \item Task Group Synchronization:
    \begin{itemize}
      \item When using \texttt{tbb::task\_group}, call \texttt{wait()} to ensure all spawned tasks have finished
    \end{itemize}
    \item Design Philosophy: TBB's task-based model minimizes the need for explicit synchronization, improving scalability and reducing overhead
  \end{itemize}
\end{frame}

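\begin{frame}[fragile]{Implicit barrier between phases (sketch)}
  A minimal sketch: each \texttt{tbb::parallel\_for} call returns only when every iteration is done, so consecutive calls behave like barrier-separated phases (\texttt{phase1}/\texttt{phase2} are hypothetical helpers):
  \vspace{0.5em}
  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"

// Phase 1: runs to completion before the call returns
tbb::parallel_for(tbb::blocked_range<size_t>(0, N),
    [&](const tbb::blocked_range<size_t>& r) {
        for (size_t i = r.begin(); i != r.end(); ++i)
            phase1(i);
    });

// Implicit "barrier": no phase-2 work starts before all phase-1 work ends

// Phase 2: safely reads everything phase 1 produced
tbb::parallel_for(tbb::blocked_range<size_t>(0, N),
    [&](const tbb::blocked_range<size_t>& r) {
        for (size_t i = r.begin(); i != r.end(); ++i)
            phase2(i);
    });
  \end{lstlisting}
\end{frame}
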
\section{Brief overview of advanced features}

\begin{frame}{Brief advanced TBB features overview}
  \begin{itemize}
    \item Pipelines and Flow Graphs:
    \begin{itemize}
      \item \texttt{tbb::flow::graph}: Build complex workflows using nodes
      \item Example: Create a pipeline that processes data through \texttt{function\_node}, \texttt{buffer\_node}, and \texttt{join\_node} to orchestrate task dependencies
    \end{itemize}
    \item Thread-safe data structures:
    \begin{itemize}
      \item \texttt{tbb::concurrent\_vector}: Dynamic array with concurrent push-backs
      \item \texttt{tbb::concurrent\_hash\_map}: High-performance hash table for concurrent access
      \item \texttt{tbb::concurrent\_queue}: Lock-free queue
      \item \texttt{tbb::concurrent\_unordered\_map}: Unordered map optimized for parallel workloads
    \end{itemize}
    \item Scalable memory allocation optimized for parallel environments:
    \begin{itemize}
      \item \texttt{tbb::scalable\_allocator}: Can be used with STL containers to reduce memory contention
      \item Example: Using \texttt{tbb::scalable\_allocator} with a \texttt{std::vector} for improved allocation performance in multi-threaded scenarios (see the sketch on the next slide)
    \end{itemize}
    \item and others\dots
  \end{itemize}
\end{frame}

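\begin{frame}[fragile]{\texttt{tbb::scalable\_allocator} (sketch)}
  A minimal sketch of plugging the scalable allocator into an STL container; allocations then go through TBB's thread-friendly memory pools instead of the global heap:
  \vspace{0.5em}
  \lstset{style=CStyle}
  \begin{lstlisting}
#include "tbb/scalable_allocator.h"
#include <vector>

// std::vector whose memory comes from TBB's scalable allocator,
// reducing allocator contention in multi-threaded code
std::vector<int, tbb::scalable_allocator<int>> data;

void fill() {
    for (int i = 0; i < 1000; ++i)
        data.push_back(i);
}
  \end{lstlisting}
\end{frame}
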
\section{Performance comparison}

\begin{frame}{Performance: TBB vs OpenMP}
  \begin{itemize}
    \item Performance factors:
    \begin{itemize}
      \item TBB's dynamic scheduling may introduce overhead for fine-grained tasks
      \item OpenMP's static scheduling can be more efficient for uniform workloads
    \end{itemize}
    \item Scalability and load balancing:
    \begin{itemize}
      \item TBB's work-stealing scheduler handles unbalanced workloads significantly better
      \item OpenMP may perform better in highly regular, compute-intensive loops
    \end{itemize}
    \item Optimization and tuning:
    \begin{itemize}
      \item Both TBB and OpenMP are highly optimized
      \item Real-world performance is case-dependent: benchmarking on target hardware and specific tasks is essential
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \centering
  \Huge{Thank You!}
\end{frame}

\begin{frame}{References}
  \begin{itemize}
    \item oneAPI Threading Building Blocks GitHub repository: \url{https://github.com/uxlfoundation/oneTBB}
    \item oneAPI Threading Building Blocks (oneTBB) documentation: \url{https://uxlfoundation.github.io/oneTBB/}
    \item CppCon 2015: Pablo Halpern ``Work Stealing'': \url{https://www.youtube.com/watch?v=iLHNF7SgVN4}
    \item Pushing the limits of work-stealing: \url{https://community.intel.com/legacyfs/online/drupal_files/managed/9d/48/ConfAnton-Pushing-the-limits-of-work-stealing-approved.pdf}
  \end{itemize}
\end{frame}

\end{document}

08-tbb/08-tbb.toc

Lines changed: 6 additions & 0 deletions

\beamer@sectionintoc {1}{Introduction to TBB}{3}{0}{1}
\beamer@sectionintoc {2}{TBB task scheduler}{9}{0}{2}
\beamer@sectionintoc {3}{TBB constructs and patterns}{11}{0}{3}
\beamer@sectionintoc {4}{Synchronization}{19}{0}{4}
\beamer@sectionintoc {5}{Brief overview of advanced features}{22}{0}{5}
\beamer@sectionintoc {6}{Performance comparison}{23}{0}{6}

index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ <h1>Parallel Programming Course Slides</h1>
      <li><a href="https://learning-process.github.io/parallel_programming_slides/slides/05-parallelism-practice.pdf" target="_blank">05: Parallelism practice</a></li>
      <li><a href="https://learning-process.github.io/parallel_programming_slides/slides/06-admin-intro-threads.pdf" target="_blank">06: Administrative questions - Threading</a></li>
      <li><a href="https://learning-process.github.io/parallel_programming_slides/slides/07-openmp.pdf" target="_blank">07: OpenMP</a></li>
+     <li><a href="https://learning-process.github.io/parallel_programming_slides/slides/08-tbb.pdf" target="_blank">08: TBB</a></li>
    </ul>
  </body>
</html>
