22
22
*
23
23
* kernel_3d_v
24
24
*
25
+ * The requested SIMD vector length nsimdvl is generally expected to
26
+ * be the compile-time NSIMDVL from memory.h.
27
+ *
28
+ * As nsimdvl only affects the starting position, it should not
29
+ * have any adverse effect on the result (only the performance).
30
+ *
25
31
*****************************************************************************/
26
32
27
- kernel_3d_v_t kernel_3d_v (cs_t * cs , cs_limits_t lim ) {
33
+ kernel_3d_v_t kernel_3d_v (cs_t * cs , cs_limits_t lim , int nsimdvl ) {
28
34
29
35
kernel_3d_v_t k3v = (kernel_3d_v_t ) {0 };
30
36
assert (cs );
37
+ assert (nsimdvl > 0 );
31
38
32
39
cs_nhalo (cs , & k3v .nhalo );
33
40
cs_nlocal (cs , k3v .nlocal );
34
41
35
42
/* Limits as requested */
36
43
k3v .lim = lim ;
44
+ k3v .nsimdvl = nsimdvl ;
37
45
38
46
/* The kernel must execute a whole number of vector blocks, which
39
47
* means we have to include the nhalo regions in (y, z). Points
@@ -44,20 +52,42 @@ kernel_3d_v_t kernel_3d_v(cs_t * cs, cs_limits_t lim) {
44
52
lim .imin , lim .imax ,
45
53
1 - k3v .nhalo , k3v .nlocal [Y ] + k3v .nhalo ,
46
54
1 - k3v .nhalo , k3v .nlocal [Z ] + k3v .nhalo
47
- };
55
+ };
48
56
49
57
k3v .nklocal [X ] = klim .imax - klim .imin + 1 ;
50
58
k3v .nklocal [Y ] = klim .jmax - klim .jmin + 1 ;
51
59
k3v .nklocal [Z ] = klim .kmax - klim .kmin + 1 ;
52
60
53
- /* Offset of first site must be start of a SIMD vector block */
61
+ /* Offset of first site must be start of a SIMD vector block at
62
+ * or below the starting point of the user-requested range. */
54
63
55
64
k3v .kindex0 = cs_index (cs , klim .imin , klim .jmin , klim .kmin );
56
- k3v .kindex0 = (k3v .kindex0 /NSIMDVL ) * NSIMDVL ;
65
+ k3v .kindex0 = (k3v .kindex0 /nsimdvl ) * nsimdvl ;
57
66
58
67
/* Extent of the contiguous block ... */
59
68
k3v .kiterations = k3v .nklocal [X ]* k3v .nklocal [Y ]* k3v .nklocal [Z ];
60
69
}
61
70
62
71
return k3v ;
63
72
}
73
+
74
+ /*****************************************************************************
75
+ *
76
+ * kernel_3d_v_exec_conf
77
+ *
78
+ * Return number of blocks, and threads per block.
79
+ *
+ * The configuration written is one-dimensional: only the x components
+ * of nblk and ntpb carry information; the y and z components of both
+ * are set to 1.
+ *
+ * ntpb->x is set to the value returned by tdp_get_max_threads().
+ * nblk->x is k3v->kiterations divided by ntpb->x, rounded up for
+ * positive values, so that a partially filled final block is counted.
+ *
+ * None of the three pointer arguments is checked before being
+ * dereferenced. The return value is always 0.
+ *
+ * NOTE(review): the block count is computed from kiterations alone;
+ * k3v->nsimdvl is not used here - confirm this is intended.
+ *
80
+ *****************************************************************************/
81
+
82
+ int kernel_3d_v_exec_conf (const kernel_3d_v_t * k3v , dim3 * nblk , dim3 * ntpb ) {
83
+
84
+ ntpb -> x = tdp_get_max_threads ();
85
+ ntpb -> y = 1 ;
86
+ ntpb -> z = 1 ;
87
+
88
+ nblk -> x = (k3v -> kiterations + ntpb -> x - 1 )/ntpb -> x ; /* round up; NOTE(review): assumes ntpb->x is non-zero */
89
+ nblk -> y = 1 ;
90
+ nblk -> z = 1 ;
91
+
92
+ return 0 ; /* no failure path in this routine */
93
+ }
0 commit comments