Skip to content

Commit 21abc72

Browse files
Add vector length to kernel helper
1 parent 38a8f03 commit 21abc72

15 files changed

+495
-35
lines changed

src/advection.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -637,7 +637,7 @@ __host__ int advection_le_2nd(advflux_t * flux, hydro_t * hydro,
637637
dim3 nblk = {};
638638
dim3 ntpb = {};
639639
cs_limits_t lim = {1, nlocal[X], 0, nlocal[Y], 0, nlocal[Z]};
640-
kernel_3d_v_t k3v = kernel_3d_v(flux->cs, lim);
640+
kernel_3d_v_t k3v = kernel_3d_v(flux->cs, lim, NSIMDVL);
641641

642642
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
643643

@@ -901,7 +901,7 @@ __host__ int advection_le_3rd(advflux_t * flux, hydro_t * hydro,
901901
dim3 nblk = {};
902902
dim3 ntpb = {};
903903
cs_limits_t lim = {1, nlocal[X], 0, nlocal[Y], 0, nlocal[Z]};
904-
kernel_3d_v_t k3v = kernel_3d_v(flux->cs, lim);
904+
kernel_3d_v_t k3v = kernel_3d_v(flux->cs, lim, NSIMDVL);
905905

906906
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
907907
lees_edw_target(flux->le, &letarget);
@@ -1467,7 +1467,7 @@ __host__ int advflux_cs_compute(advflux_t * flux, hydro_t * h, field_t * f) {
14671467
{
14681468
dim3 nblk = {};
14691469
dim3 ntpb = {};
1470-
kernel_3d_v_t k3v = kernel_3d_v(flux->cs, lim);
1470+
kernel_3d_v_t k3v = kernel_3d_v(flux->cs, lim, NSIMDVL);
14711471

14721472
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
14731473

src/advection_bcs.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ int advection_bcs_no_normal_flux(advflux_t * flux, map_t * map) {
5252
dim3 nblk = {};
5353
dim3 ntpb = {};
5454
cs_limits_t lim = {1, nlocal[X], 0, nlocal[Y], 0, nlocal[Z]};
55-
kernel_3d_v_t k3v = kernel_3d_v(flux->cs, lim);
55+
kernel_3d_v_t k3v = kernel_3d_v(flux->cs, lim, NSIMDVL);
5656

5757
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
5858

src/blue_phase_beris_edwards.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ __host__ int beris_edw_update_driver(beris_edw_t * be,
509509
dim3 nblk = {};
510510
dim3 ntpb = {};
511511
cs_limits_t lim = {1, nlocal[X], 1, nlocal[Y], 1, nlocal[Z]};
512-
kernel_3d_v_t k3v = kernel_3d_v(be->cs, lim);
512+
kernel_3d_v_t k3v = kernel_3d_v(be->cs, lim, NSIMDVL);
513513

514514
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
515515

@@ -918,7 +918,7 @@ __host__ int beris_edw_h_driver(beris_edw_t * be, fe_t * fe) {
918918
dim3 nblk = {};
919919
dim3 ntpb = {};
920920
cs_limits_t lim = {1, nlocal[X], 1, nlocal[Y], 1, nlocal[Z]};
921-
kernel_3d_v_t k3v = kernel_3d_v(be->cs, lim);
921+
kernel_3d_v_t k3v = kernel_3d_v(be->cs, lim, NSIMDVL);
922922

923923
TIMER_start(TIMER_BE_MOL_FIELD);
924924

src/collision.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ __host__ int lb_collision_mrt(lb_t * lb, hydro_t * hydro, map_t * map,
186186
dim3 nblk = {};
187187
dim3 ntpb = {};
188188
cs_limits_t lim = {1, nlocal[X], 1, nlocal[Y], 1, nlocal[Z]};
189-
kernel_3d_v_t k3v = kernel_3d_v(lb->cs, lim);
189+
kernel_3d_v_t k3v = kernel_3d_v(lb->cs, lim, NSIMDVL);
190190

191191
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
192192

@@ -618,7 +618,7 @@ __host__ int lb_collision_binary(lb_t * lb, hydro_t * hydro, noise_t * noise,
618618
dim3 nblk = {};
619619
dim3 ntpb = {};
620620
cs_limits_t lim = {1, nlocal[X], 1, nlocal[Y], 1, nlocal[Z]};
621-
kernel_3d_v_t k3v = kernel_3d_v(lb->cs, lim);
621+
kernel_3d_v_t k3v = kernel_3d_v(lb->cs, lim, NSIMDVL);
622622

623623
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
624624

src/gradient_3d_7pt_fluid.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
* Edinburgh Soft Matter and Statistical Physics Group and
2626
* Edinburgh Parallel Computing Centre
2727
*
28-
* (c) 2010-2016 The University of Edinburgh
28+
* (c) 2010-2024 The University of Edinburgh
2929
*
3030
* Contributing authors:
3131
* Kevin Stratford ([email protected])
@@ -202,7 +202,7 @@ __host__ int grad_3d_7pt_fluid_operator(cs_t * cs, lees_edw_t * le,
202202
.jmin = 1 - nextra, .jmax = nlocal[Y] + nextra,
203203
.kmin = 1 - nextra, .kmax = nlocal[Z] + nextra
204204
};
205-
kernel_3d_v_t k3v = kernel_3d_v(cs, lim);
205+
kernel_3d_v_t k3v = kernel_3d_v(cs, lim, NSIMDVL);
206206

207207
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
208208

@@ -471,7 +471,7 @@ __host__ int grad_3d_7pt_dab_compute(cs_t * cs, lees_edw_t * le,
471471
.jmin = 1 - nextra, .jmax = nlocal[Y] + nextra,
472472
.kmin = 1 - nextra, .kmax = nlocal[Z] + nextra
473473
};
474-
kernel_3d_v_t k3v = kernel_3d_v(cs, lim);
474+
kernel_3d_v_t k3v = kernel_3d_v(cs, lim, NSIMDVL);
475475

476476
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
477477

src/hydro.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -846,7 +846,7 @@ __host__ int hydro_correct_momentum(hydro_t * hydro) {
846846
dim3 nblk = {};
847847
dim3 ntpb = {};
848848
cs_limits_t lim = {1, nlocal[X], 1, nlocal[Y], 1, nlocal[Z]};
849-
kernel_3d_v_t k3v = kernel_3d_v(hydro->cs, lim);
849+
kernel_3d_v_t k3v = kernel_3d_v(hydro->cs, lim, NSIMDVL);
850850

851851
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
852852

@@ -883,7 +883,7 @@ __host__ int hydro_correct_momentum(hydro_t * hydro) {
883883
dim3 nblk = {};
884884
dim3 ntpb = {};
885885
cs_limits_t lim = {1, nlocal[X], 1, nlocal[Y], 1, nlocal[Z]};
886-
kernel_3d_v_t k3v = kernel_3d_v(hydro->cs, lim);
886+
kernel_3d_v_t k3v = kernel_3d_v(hydro->cs, lim, NSIMDVL);
887887

888888
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
889889

src/kernel_3d_v.c

+34-4
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,26 @@
2222
*
2323
* kernel_3d_v
2424
*
25+
* The request simd vector length nsimdvl is generally expected to
26+
* be the compile time NSIMDVL from memory.h.
27+
*
28+
* As nsimdvl only affects the starting position, it should not
29+
* have any adverse effect on the result (only the performance).
30+
*
2531
*****************************************************************************/
2632

27-
kernel_3d_v_t kernel_3d_v(cs_t * cs, cs_limits_t lim) {
33+
kernel_3d_v_t kernel_3d_v(cs_t * cs, cs_limits_t lim, int nsimdvl) {
2834

2935
kernel_3d_v_t k3v = (kernel_3d_v_t) {0};
3036
assert(cs);
37+
assert(nsimdvl > 0);
3138

3239
cs_nhalo(cs, &k3v.nhalo);
3340
cs_nlocal(cs, k3v.nlocal);
3441

3542
/* Limits as requested */
3643
k3v.lim = lim;
44+
k3v.nsimdvl = nsimdvl;
3745

3846
/* The kernel must execute a whole number of vector blocks, which
3947
* means we have to include the nhalo regions in (y, z). Points
@@ -44,20 +52,42 @@ kernel_3d_v_t kernel_3d_v(cs_t * cs, cs_limits_t lim) {
4452
lim.imin, lim.imax,
4553
1 - k3v.nhalo, k3v.nlocal[Y] + k3v.nhalo,
4654
1 - k3v.nhalo, k3v.nlocal[Z] + k3v.nhalo
47-
};
55+
};
4856

4957
k3v.nklocal[X] = klim.imax - klim.imin + 1;
5058
k3v.nklocal[Y] = klim.jmax - klim.jmin + 1;
5159
k3v.nklocal[Z] = klim.kmax - klim.kmin + 1;
5260

53-
/* Offset of first site must be start of a SIMD vector block */
61+
/* Offset of first site must be start of a SIMD vector block at
62+
* or below the starting point of the user-requested range. */
5463

5564
k3v.kindex0 = cs_index(cs, klim.imin, klim.jmin, klim.kmin);
56-
k3v.kindex0 = (k3v.kindex0/NSIMDVL)*NSIMDVL;
65+
k3v.kindex0 = (k3v.kindex0/nsimdvl)*nsimdvl;
5766

5867
/* Extent of the contiguous block ... */
5968
k3v.kiterations = k3v.nklocal[X]*k3v.nklocal[Y]*k3v.nklocal[Z];
6069
}
6170

6271
return k3v;
6372
}
73+
74+
/*****************************************************************************
75+
*
76+
* kernel_3d_v_exec_conf
77+
*
78+
* Return number of blocks, and threads per block.
79+
*
80+
*****************************************************************************/
81+
82+
int kernel_3d_v_exec_conf(const kernel_3d_v_t * k3v, dim3 * nblk, dim3 * ntpb) {
83+
84+
ntpb->x = tdp_get_max_threads();
85+
ntpb->y = 1;
86+
ntpb->z = 1;
87+
88+
nblk->x = (k3v->kiterations + ntpb->x - 1)/ntpb->x;
89+
nblk->y = 1;
90+
nblk->z = 1;
91+
92+
return 0;
93+
}

src/kernel_3d_v.h

+10-6
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,16 @@ struct kernel_3d_v_s {
2525
int nhalo; /* physical system - number of halo sites */
2626
int nlocal[3]; /* local system extent */
2727

28-
int kindex0; /* first index for kernel executtion */
28+
int kindex0; /* first index for kernel execution */
2929
int kiterations; /* Number of iterations required for kernel (1d) */
3030

3131
int nklocal[3]; /* local kernel extent */
3232
cs_limits_t lim; /* coordinate limits of the kernel (inclusive) */
33+
int nsimdvl; /* Requested vector length */
3334
};
3435

35-
kernel_3d_v_t kernel_3d_v(cs_t * cs, cs_limits_t lim);
36+
kernel_3d_v_t kernel_3d_v(cs_t * cs, cs_limits_t lim, int nsimdvl);
37+
int kernel_3d_v_exec_conf(const kernel_3d_v_t * k3v, dim3 * nblk, dim3 * ntpb);
3638

3739
/*****************************************************************************
3840
*
@@ -46,10 +48,13 @@ kernel_3d_v_t kernel_3d_v(cs_t * cs, cs_limits_t lim);
4648
*
4749
* kernel_3d_v_coords
4850
*
51+
* Note that the offset kindex0 gets added to the argument kindex here.
52+
* kindex if then the index from for_simt_parallel().
53+
*
4954
*****************************************************************************/
5055

5156
__host__ __device__ static inline void kernel_3d_v_coords(const kernel_3d_v_t * k3v,
52-
int kindex0,
57+
int kindex,
5358
int ic[NSIMDVL],
5459
int jc[NSIMDVL],
5560
int kc[NSIMDVL]) {
@@ -64,7 +69,7 @@ __host__ __device__ static inline void kernel_3d_v_coords(const kernel_3d_v_t *
6469
xs = k3v->nklocal[Y]*k3v->nklocal[Z];
6570

6671
for_simd_v(iv, NSIMDVL) {
67-
index = k3v->kindex0 + kindex0 + iv;
72+
index = k3v->kindex0 + kindex + iv;
6873

6974
icv[iv] = index/xs;
7075
jcv[iv] = (index - icv[iv]*xs)/k3v->nklocal[Z];
@@ -144,11 +149,10 @@ __host__ __device__ static inline void kernel_3d_v_cs_index(const kernel_3d_v_t
144149
xstr = ystr*(k3v->nlocal[Y] + 2*nh);
145150

146151
for_simd_v(iv, NSIMDVL) {
147-
index[iv] = xstr*(nh + icv[iv] - 1) + ystr*(nh + jcv[iv] - 1) + nh + kcv[iv] - 1;
152+
index[iv] = xstr*(nh + icv[iv] - 1) + ystr*(nh + jcv[iv] - 1) + nh + kcv[iv] - 1;
148153
}
149154

150155
return;
151156
}
152157

153158
#endif
154-

src/phi_force_colloid.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ __host__ int pth_force_fluid_driver(pth_t * pth, hydro_t * hydro) {
284284
dim3 nblk = {};
285285
dim3 ntpb = {};
286286
cs_limits_t lim = {1, nlocal[X], 1, nlocal[Y], 1, nlocal[Z]};
287-
kernel_3d_v_t k3v = kernel_3d_v(pth->cs, lim);
287+
kernel_3d_v_t k3v = kernel_3d_v(pth->cs, lim, NSIMDVL);
288288

289289
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
290290

src/phi_force_stress.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ __host__ int pth_stress_compute(pth_t * pth, fe_t * fe) {
189189
.jmin = 1 - nextra, .jmax = nlocal[Y] + nextra,
190190
.kmin = 1 - nextra, .kmax = nlocal[Z] + nextra
191191
};
192-
kernel_3d_v_t k3v = kernel_3d_v(pth->cs, lim);
192+
kernel_3d_v_t k3v = kernel_3d_v(pth->cs, lim, NSIMDVL);
193193

194194
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
195195

src/propagation.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ __host__ int lb_propagation_driver(lb_t * lb) {
7575
dim3 nblk = {};
7676
dim3 ntpb = {};
7777
cs_limits_t lim = {1, nlocal[X], 1, nlocal[Y], 1, nlocal[Z]};
78-
kernel_3d_v_t k3v = kernel_3d_v(lb->cs, lim);
78+
kernel_3d_v_t k3v = kernel_3d_v(lb->cs, lim, NSIMDVL);
7979

8080
kernel_3d_launch_param(k3v.kiterations, &nblk, &ntpb);
8181

0 commit comments

Comments
 (0)