Skip to content

Commit f6cd900

Browse files
authored
[SYCL][NFC] Extract range rounding logic (#20893)
Extract the range rounding logic to a separate header, so it can be used in both the handler and the queue (once the handler-less kernel submission logic uses range rounding).
1 parent 6aa510d commit f6cd900

14 files changed

+370
-309
lines changed
Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
//==----------- range_rounding.hpp --- SYCL range rounding utils -----------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#pragma once
10+
11+
#include <sycl/detail/cg_types.hpp>
#include <sycl/detail/export.hpp>
#include <sycl/detail/helpers.hpp>
#include <sycl/detail/iostream_proxy.hpp>
#include <sycl/device.hpp>
#include <sycl/ext/oneapi/kernel_properties/properties.hpp>
#include <sycl/id.hpp>
#include <sycl/item.hpp>
#include <sycl/kernel_handler.hpp>
#include <sycl/range.hpp>

#include <algorithm>
#include <array>
#include <limits>
#include <tuple>
#include <type_traits>

#include <stddef.h>
#include <stdint.h>
26+
27+
namespace sycl {
28+
inline namespace _V1 {
29+
30+
namespace detail {
31+
32+
template <int Dims> class RoundedRangeIDGenerator {
33+
id<Dims> Id;
34+
id<Dims> InitId;
35+
range<Dims> UserRange;
36+
range<Dims> RoundedRange;
37+
bool Done = false;
38+
39+
public:
40+
RoundedRangeIDGenerator(const id<Dims> &Id, const range<Dims> &UserRange,
41+
const range<Dims> &RoundedRange)
42+
: Id(Id), InitId(Id), UserRange(UserRange), RoundedRange(RoundedRange) {
43+
for (int i = 0; i < Dims; ++i)
44+
if (Id[i] >= UserRange[i])
45+
Done = true;
46+
}
47+
48+
explicit operator bool() { return !Done; }
49+
50+
void updateId() {
51+
for (int i = 0; i < Dims; ++i) {
52+
Id[i] += RoundedRange[i];
53+
if (Id[i] < UserRange[i])
54+
return;
55+
Id[i] = InitId[i];
56+
}
57+
Done = true;
58+
}
59+
60+
id<Dims> getId() { return Id; }
61+
62+
template <typename KernelType> auto getItem() {
63+
if constexpr (std::is_invocable_v<KernelType, item<Dims> &> ||
64+
std::is_invocable_v<KernelType, item<Dims> &, kernel_handler>)
65+
return detail::Builder::createItem<Dims, true>(UserRange, getId(), {});
66+
else {
67+
static_assert(std::is_invocable_v<KernelType, item<Dims, false> &> ||
68+
std::is_invocable_v<KernelType, item<Dims, false> &,
69+
kernel_handler>,
70+
"Kernel must be invocable with an item!");
71+
return detail::Builder::createItem<Dims, false>(UserRange, getId());
72+
}
73+
}
74+
};
75+
76+
// TODO: The wrappers can be optimized further so that the body
77+
// essentially looks like this:
78+
// for (auto z = it[2]; z < UserRange[2]; z += it.get_range(2))
79+
// for (auto y = it[1]; y < UserRange[1]; y += it.get_range(1))
80+
// for (auto x = it[0]; x < UserRange[0]; x += it.get_range(0))
81+
// KernelFunc({x,y,z});
82+
template <typename TransformedArgType, int Dims, typename KernelType>
83+
class RoundedRangeKernel {
84+
public:
85+
range<Dims> UserRange;
86+
KernelType KernelFunc;
87+
void operator()(item<Dims> It) const {
88+
auto RoundedRange = It.get_range();
89+
for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
90+
Gen.updateId()) {
91+
auto item = Gen.template getItem<KernelType>();
92+
KernelFunc(item);
93+
}
94+
}
95+
96+
// Copy the properties_tag getter from the original kernel to propagate
97+
// property(s)
98+
template <
99+
typename T = KernelType,
100+
typename = std::enable_if_t<ext::oneapi::experimental::detail::
101+
HasKernelPropertiesGetMethod<T>::value>>
102+
auto get(ext::oneapi::experimental::properties_tag) const {
103+
return KernelFunc.get(ext::oneapi::experimental::properties_tag{});
104+
}
105+
};
106+
107+
template <typename TransformedArgType, int Dims, typename KernelType>
108+
class RoundedRangeKernelWithKH {
109+
public:
110+
range<Dims> UserRange;
111+
KernelType KernelFunc;
112+
void operator()(item<Dims> It, kernel_handler KH) const {
113+
auto RoundedRange = It.get_range();
114+
for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
115+
Gen.updateId()) {
116+
auto item = Gen.template getItem<KernelType>();
117+
KernelFunc(item, KH);
118+
}
119+
}
120+
121+
// Copy the properties_tag getter from the original kernel to propagate
122+
// property(s)
123+
template <
124+
typename T = KernelType,
125+
typename = std::enable_if_t<ext::oneapi::experimental::detail::
126+
HasKernelPropertiesGetMethod<T>::value>>
127+
auto get(ext::oneapi::experimental::properties_tag) const {
128+
return KernelFunc.get(ext::oneapi::experimental::properties_tag{});
129+
}
130+
};
131+
132+
template <typename WrapperT, typename TransformedArgType, int Dims,
133+
typename KernelType,
134+
std::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<
135+
KernelType, TransformedArgType>::value> * = nullptr>
136+
auto getRangeRoundedKernelLambda(KernelType KernelFunc, range<Dims> UserRange) {
137+
return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims, KernelType>{
138+
UserRange, KernelFunc};
139+
}
140+
141+
template <typename WrapperT, typename TransformedArgType, int Dims,
142+
typename KernelType,
143+
std::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<
144+
KernelType, TransformedArgType>::value> * = nullptr>
145+
auto getRangeRoundedKernelLambda(KernelType KernelFunc, range<Dims> UserRange) {
146+
return detail::RoundedRangeKernel<TransformedArgType, Dims, KernelType>{
147+
UserRange, KernelFunc};
148+
}
149+
150+
void __SYCL_EXPORT GetRangeRoundingSettings(size_t &MinFactor,
151+
size_t &GoodFactor,
152+
size_t &MinRange);
153+
154+
std::tuple<std::array<size_t, 3>, bool>
155+
__SYCL_EXPORT getMaxWorkGroups(const device &Device);
156+
157+
bool __SYCL_EXPORT DisableRangeRounding();
158+
159+
bool __SYCL_EXPORT RangeRoundingTrace();
160+
161+
template <int Dims>
162+
std::tuple<range<Dims>, bool> getRoundedRange(range<Dims> UserRange,
163+
const device &Device) {
164+
range<Dims> RoundedRange = UserRange;
165+
// Disable the rounding-up optimizations under these conditions:
166+
// 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
167+
// 2. The kernel is provided via an interoperability method (this uses a
168+
// different code path).
169+
// 3. The range is already a multiple of the rounding factor.
170+
//
171+
// Cases 2 and 3 could be supported with extra effort.
172+
// As an optimization for the common case it is an
173+
// implementation choice to not support those scenarios.
174+
// Note that "this_item" is a free function, i.e. not tied to any
175+
// specific id or item. When concurrent parallel_fors are executing
176+
// on a device it is difficult to tell which parallel_for the call is
177+
// being made from. One could replicate portions of the
178+
// call-graph to make this_item calls kernel-specific but this is
179+
// not considered worthwhile.
180+
181+
// Perform range rounding if rounding-up is enabled.
182+
if (DisableRangeRounding())
183+
return {range<Dims>{}, false};
184+
185+
// Range should be a multiple of this for reasonable performance.
186+
size_t MinFactorX = 16;
187+
// Range should be a multiple of this for improved performance.
188+
size_t GoodFactor = 32;
189+
// Range should be at least this to make rounding worthwhile.
190+
size_t MinRangeX = 1024;
191+
192+
// Check if rounding parameters have been set through environment:
193+
// SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
194+
GetRangeRoundingSettings(MinFactorX, GoodFactor, MinRangeX);
195+
196+
// In SYCL, each dimension of a global range size is specified by
197+
// a size_t, which can be up to 64 bits. All backends should be
198+
// able to accept a kernel launch with a 32-bit global range size
199+
// (i.e. do not throw an error). The OpenCL CPU backend will
200+
// accept every 64-bit global range, but the GPU backends will not
201+
// generally accept every 64-bit global range. So, when we get a
202+
// non-32-bit global range, we wrap the old kernel in a new kernel
203+
// that has each work item perform multiple invocations the old
204+
// kernel in a 32-bit global range.
205+
id<Dims> MaxNWGs = [&] {
206+
auto [MaxWGs, HasMaxWGs] = getMaxWorkGroups(Device);
207+
if (!HasMaxWGs) {
208+
id<Dims> Default;
209+
for (int i = 0; i < Dims; ++i)
210+
Default[i] = (std::numeric_limits<int32_t>::max)();
211+
return Default;
212+
}
213+
214+
id<Dims> IdResult;
215+
size_t Limit = (std::numeric_limits<int>::max)();
216+
for (int i = 0; i < Dims; ++i)
217+
IdResult[i] = (std::min)(Limit, MaxWGs[Dims - i - 1]);
218+
return IdResult;
219+
}();
220+
auto M = (std::numeric_limits<uint32_t>::max)();
221+
range<Dims> MaxRange;
222+
for (int i = 0; i < Dims; ++i) {
223+
auto DesiredSize = MaxNWGs[i] * GoodFactor;
224+
MaxRange[i] =
225+
DesiredSize <= M ? DesiredSize : (M / GoodFactor) * GoodFactor;
226+
}
227+
228+
bool DidAdjust = false;
229+
auto Adjust = [&](int Dim, size_t Value) {
230+
if (RangeRoundingTrace())
231+
std::cout << "parallel_for range adjusted at dim " << Dim << " from "
232+
<< RoundedRange[Dim] << " to " << Value << std::endl;
233+
RoundedRange[Dim] = Value;
234+
DidAdjust = true;
235+
};
236+
237+
#ifdef __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
238+
size_t GoodExpFactor = 1;
239+
switch (Dims) {
240+
case 1:
241+
GoodExpFactor = 32; // Make global range multiple of {32}
242+
break;
243+
case 2:
244+
GoodExpFactor = 16; // Make global range multiple of {16, 16}
245+
break;
246+
case 3:
247+
GoodExpFactor = 8; // Make global range multiple of {8, 8, 8}
248+
break;
249+
}
250+
251+
// Check if rounding parameters have been set through environment:
252+
// SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
253+
GetRangeRoundingSettings(MinFactorX, GoodExpFactor, MinRangeX);
254+
255+
for (auto i = 0; i < Dims; ++i)
256+
if (UserRange[i] % GoodExpFactor) {
257+
Adjust(i, ((UserRange[i] / GoodExpFactor) + 1) * GoodExpFactor);
258+
}
259+
#else
260+
// Perform range rounding if there are sufficient work-items to
261+
// need rounding and the user-specified range is not a multiple of
262+
// a "good" value.
263+
if (RoundedRange[0] % MinFactorX != 0 && RoundedRange[0] >= MinRangeX) {
264+
// It is sufficient to round up just the first dimension.
265+
// Multiplying the rounded-up value of the first dimension
266+
// by the values of the remaining dimensions (if any)
267+
// will yield a rounded-up value for the total range.
268+
Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor);
269+
}
270+
#endif // __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
271+
#ifdef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
272+
// If we are forcing range rounding kernels to be used, we always want the
273+
// rounded range kernel to be generated, even if rounding isn't needed
274+
DidAdjust = true;
275+
#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
276+
277+
for (int i = 0; i < Dims; ++i)
278+
if (RoundedRange[i] > MaxRange[i])
279+
Adjust(i, MaxRange[i]);
280+
281+
if (!DidAdjust)
282+
return {range<Dims>{}, false};
283+
return {RoundedRange, true};
284+
}
285+
286+
} // namespace detail
287+
} // namespace _V1
288+
} // namespace sycl

0 commit comments

Comments
 (0)