Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,3 +479,9 @@ def check_array(array, exp_halo, exp_shape, rotate=False):

assert tuple(array.halo) == exp_halo
assert tuple(shape) == tuple(exp_shape)


# Main body in Operator IET, depending on ISA
def body0(op):
bidx = 0 if 'sse' not in configuration['platform'].known_isas else 1
return op.body.body[bidx]
30 changes: 22 additions & 8 deletions devito/passes/clusters/aliases.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
CustomDimension, Eq, Hyperplane, IncrDimension, Indexed, ModuloDimension, Size,
StencilDimension, Symbol, Temp, TempArray, TempFunction
)
from devito.types.dimension import ConditionalDimension, SubsamplingFactor
from devito.types.grid import MultiSubDimension

__all__ = ['cire']
Expand Down Expand Up @@ -126,6 +127,8 @@ def _aliases_from_clusters(self, cgroup, exclude, meta):
for mapper in self._generate(cgroup, exclude):
# Clusters -> AliasList
found = collect(mapper.extracted, meta.ispace, self.opt_minstorage)
if not found:
continue
exprs, aliases = self._choose(found, cgroup, mapper)

# AliasList -> Schedule
Expand All @@ -146,7 +149,7 @@ def _aliases_from_clusters(self, cgroup, exclude, meta):
# Schedule -> [Clusters]_k
processed, subs = lower_schedule(schedule, meta, self.sregistry,
self.opt_ftemps, self.opt_min_dtype,
self.opt_minmem)
self.opt_minmem, nclusters=len(cgroup))

# [Clusters]_k -> [Clusters]_k (optimization)
if self.opt_multisubdomain:
Expand Down Expand Up @@ -271,7 +274,6 @@ def _do_generate(self, exprs, exclude, cbk_search, cbk_compose=None):
free_symbols = i.free_symbols
if {a.function for a in free_symbols} & exclude:
continue

mapper.add(i, make, terms)

return mapper
Expand Down Expand Up @@ -852,7 +854,7 @@ def optimize_schedule_rotations(schedule, sregistry):


def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype,
opt_minmem):
opt_minmem, nclusters=1):
"""
Turn a Schedule into a sequence of Clusters.
"""
Expand Down Expand Up @@ -923,15 +925,26 @@ def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype,
callback = lambda idx: obj[ # noqa: B023
[i + s for i, s in zip(idx, shift, strict=True)] # noqa: B023
]
guards = meta.guards
else:
# Degenerate case: scalar expression
assert writeto.size == 0

dtype = sympy_dtype(pivot, base=meta.dtype, smin=opt_min_dtype)
obj = Temp(name=name, dtype=dtype)
expression = Eq(obj, uxreplace(pivot, subs))
is_cond = any(isinstance(d, (SubsamplingFactor, ConditionalDimension))
for d in pivot.free_symbols)
if meta.guards and is_cond and nclusters > 1:
# Scalar alias that depends on a guard, unsafe to lift out of the guard
# Do not alias
expression = None
callback = lambda idx: uxreplace(pivot, subs) # noqa: B023
else:
dtype = sympy_dtype(pivot, base=meta.dtype, smin=opt_min_dtype)
obj = Temp(name=name, dtype=dtype, is_const=True)
expression = Eq(obj, uxreplace(pivot, subs))

callback = lambda idx: obj # noqa: B023
callback = lambda idx: obj # noqa: B023
# Only keep the guard if there is no cross-cluster reuse of the scalar
guards = meta.guards if nclusters == 1 else None

# Create the substitution rules for the aliasing expressions
subs.update({
Expand All @@ -957,7 +970,8 @@ def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype,
properties[Hyperplane(writeto.itdims)] = {SEPARABLE}

# Finally, build the alias Cluster
clusters.append(Cluster(expression, ispace, meta.guards, properties))
if expression is not None:
clusters.append(Cluster(expression, ispace, guards, properties))

return clusters, subs

Expand Down
11 changes: 9 additions & 2 deletions devito/passes/clusters/cse.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ def cse_dtype(exprdtype, cdtype):
Return the dtype of a CSE temporary given the dtype of the expression to be
captured and the cluster's dtype.
"""
if np.issubdtype(cdtype, np.floating) and np.issubdtype(exprdtype, np.integer):
# Integer expression and floating-point cluster: promote to the floating point
# np.promote_types upcast integers (e.g int32 -> Float64) so we
# need to ensure that the promoted type is not larger than the cluster's dtype
return cdtype

if np.issubdtype(cdtype, np.complexfloating):
return np.promote_types(exprdtype, cdtype(0).real.__class__).type
else:
Expand Down Expand Up @@ -97,8 +103,9 @@ def cse(cluster, sregistry=None, options=None, **kwargs):
if cluster.is_fence:
return cluster

make_dtype = lambda e: cse_dtype(e.dtype, dtype)
make = lambda e: CTemp(name=sregistry.make_name(), dtype=make_dtype(e))
def make(e):
edtype = cse_dtype(e.dtype, dtype)
return CTemp(name=sregistry.make_name(), dtype=edtype)

exprs = _cse(cluster, make, min_cost=min_cost, mode=mode)

Expand Down
9 changes: 8 additions & 1 deletion devito/passes/clusters/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,14 @@ def callback(self, clusters, prefix):
# unless the guard is for an outer dimension
guards = {} if c.is_scalar and not (prefix[:-1] and c.guards) else c.guards

lifted.append(c.rebuild(ispace=ispace, properties=properties, guards=guards))
_lifted = c.rebuild(ispace=ispace, properties=properties, guards=guards)
if guards and clusters[max(n-1, 0)].guards != guards and _lifted.is_scalar:
# Heuristic: if the lifted Cluster has different guards than the
# previous one, then we are likely to end up with a separate
# Cluster, hence give up on lifting
processed.append(_lifted)
else:
lifted.append(_lifted)

return lifted + processed

Expand Down
2 changes: 1 addition & 1 deletion devito/types/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1589,7 +1589,7 @@ def _rebuild(self, *args, **kwargs):
comps = [f.func(*args, name=f.name.replace(self.name, newname), **kwargs)
for f in self.flat()]
# Rebuild the matrix with the new components
return self._new(comps)
return self._new(*self.shape, comps)

func = _rebuild

Expand Down
2 changes: 1 addition & 1 deletion examples/mpi/overview.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@
" _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);\n",
" _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);\n",
"\n",
" float r0 = 1.0F/h_x;\n",
" const float r0 = 1.0F/h_x;\n",
"\n",
" for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n",
" {\n",
Expand Down
57 changes: 35 additions & 22 deletions examples/performance/00_overview.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@
"}\n",
"STOP(section0,timers)\n",
"\n",
"float r1 = 1.0F/h_y;\n",
"const float r1 = 1.0F/h_y;\n",
"\n",
"for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n",
"{\n",
Expand Down Expand Up @@ -572,7 +572,13 @@
"+ u[t1][x + 4][y + 4][z + 4] = (f[x + 1][y + 1][z + 1]*f[x + 1][y + 1][z + 1])*((-6.66666667e-1F*r0)*(8.33333333e-2F*r0*u[t0][x + 4][y + 1][z + 4] - 6.66666667e-1F*r0*u[t0][x + 4][y + 2][z + 4] + 6.66666667e-1F*r0*u[t0][x + 4][y + 4][z + 4] - 8.33333333e-2F*r0*u[t0][x + 4][y + 5][z + 4]) + (-8.33333333e-2F*r0)*(8.33333333e-2F*r0*u[t0][x + 4][y + 4][z + 4] - 6.66666667e-1F*r0*u[t0][x + 4][y + 5][z + 4] + 6.66666667e-1F*r0*u[t0][x + 4][y + 7][z + 4] - 8.33333333e-2F*r0*u[t0][x + 4][y + 8][z + 4]) + (8.33333333e-2F*r0)*(8.33333333e-2F*r0*u[t0][x + 4][y][z + 4] - 6.66666667e-1F*r0*u[t0][x + 4][y + 1][z + 4] + 6.66666667e-1F*r0*u[t0][x + 4][y + 3][z + 4] - 8.33333333e-2F*r0*u[t0][x + 4][y + 4][z + 4]) + (6.66666667e-1F*r0)*(8.33333333e-2F*r0*u[t0][x + 4][y + 3][z + 4] - 6.66666667e-1F*r0*u[t0][x + 4][y + 4][z + 4] + 6.66666667e-1F*r0*u[t0][x + 4][y + 6][z + 4] - 8.33333333e-2F*r0*u[t0][x + 4][y + 7][z + 4]))*sinf(f[x + 1][y + 1][z + 1]);\n",
" }\n",
" }\n",
" }\n",
" }\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
Expand Down Expand Up @@ -652,7 +658,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -712,7 +718,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -753,7 +759,14 @@
" }\n",
" }\n",
" STOP(section0,timers)\n",
"}\n"
"}"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
Expand All @@ -772,7 +785,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -863,7 +876,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 15,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -919,7 +932,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 16,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -976,7 +989,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -994,7 +1007,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 18,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1044,7 +1057,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 19,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1112,7 +1125,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 20,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1192,7 +1205,7 @@
" }\n",
" STOP(section0,timers)\n",
"\n",
" float r1 = 1.0F/h_y;\n",
" const float r1 = 1.0F/h_y;\n",
"\n",
" for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n",
" {\n",
Expand Down Expand Up @@ -1279,7 +1292,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 21,
"metadata": {},
"outputs": [
{
Expand All @@ -1304,7 +1317,7 @@
"}\n",
"STOP(section0,timers)\n",
"\n",
"float r1 = 1.0F/h_y;\n",
"const float r1 = 1.0F/h_y;\n",
"\n",
"for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n",
"{\n",
Expand Down Expand Up @@ -1369,7 +1382,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 22,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1404,7 +1417,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 23,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1483,7 +1496,7 @@
" }\n",
" STOP(section0,timers)\n",
"\n",
" float r1 = 1.0F/h_y;\n",
" const float r1 = 1.0F/h_y;\n",
"\n",
" for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n",
" {\n",
Expand Down Expand Up @@ -1546,7 +1559,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 24,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1624,8 +1637,8 @@
" }\n",
" STOP(section0,timers)\n",
"\n",
" float r1 = 1.0F/h_x;\n",
" float r2 = 1.0F/h_y;\n",
" const float r1 = 1.0F/h_x;\n",
" const float r2 = 1.0F/h_y;\n",
"\n",
" for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n",
" {\n",
Expand Down Expand Up @@ -1718,7 +1731,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.13.11"
}
},
"nbformat": 4,
Expand Down
17 changes: 11 additions & 6 deletions examples/performance/01_gpu.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,13 @@
"name": "stderr",
"output_type": "stream",
"text": [
"NUMA domain count autodetection failed, assuming 1\n",
"Operator `Kernel` ran in 0.01 s\n",
"NUMA domain count autodetection failed, assuming 1\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Operator `Kernel` ran in 0.01 s\n"
]
}
Expand Down Expand Up @@ -292,9 +297,9 @@
" const int x_stride0 = x_fsz0*y_fsz0;\n",
" const int y_stride0 = y_fsz0;\n",
"\n",
" float r0 = 1.0F/dt;\n",
" float r1 = 1.0F/(h_x*h_x);\n",
" float r2 = 1.0F/(h_y*h_y);\n",
" const float r0 = 1.0F/dt;\n",
" const float r1 = 1.0F/(h_x*h_x);\n",
" const float r2 = 1.0F/(h_y*h_y);\n",
"\n",
" for (int time = time_m; time <= time_M; time += 1)\n",
" {\n",
Expand Down Expand Up @@ -340,7 +345,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
"version": "3.13.11"
}
},
"nbformat": 4,
Expand Down
Loading
Loading