@@ -235,77 +235,37 @@ This performs:
235
235
cudaEventElapsedTime (&milliseconds, start, stop);
236
236
printf (" [time ] \t CUFFT Plan\t\t %.3g s\n " , milliseconds/1000 );
237
237
#endif
238
- CNTime timer;
239
- if (max (nf1, max (nf2, nf3)) < 2e3 ) {
240
- timer.start ();
241
- FLT *fwkerhalf1, *fwkerhalf2, *fwkerhalf3;
242
-
243
- fwkerhalf1 = (FLT*)malloc (sizeof (FLT)*(nf1/2 +1 ));
244
- onedim_fseries_kernel (nf1, fwkerhalf1, d_plan->spopts );
245
- if (dim > 1 ){
246
- fwkerhalf2 = (FLT*)malloc (sizeof (FLT)*(nf2/2 +1 ));
247
- onedim_fseries_kernel (nf2, fwkerhalf2, d_plan->spopts );
248
- }
249
- if (dim > 2 ){
250
- fwkerhalf3 = (FLT*)malloc (sizeof (FLT)*(nf3/2 +1 ));
251
- onedim_fseries_kernel (nf3, fwkerhalf3, d_plan->spopts );
252
- }
253
- #ifdef TIME
254
- printf (" [time ] \t kernel fser (on CPU):\t %.3g s\n " , timer.elapsedsec ());
255
- #endif
256
- cudaEventRecord (start);
257
- checkCudaErrors (cudaMemcpy (d_plan->fwkerhalf1 ,fwkerhalf1,(nf1/2 +1 )*
258
- sizeof (FLT),cudaMemcpyHostToDevice));
259
- if (dim > 1 )
260
- checkCudaErrors (cudaMemcpy (d_plan->fwkerhalf2 ,fwkerhalf2,(nf2/2 +1 )*
261
- sizeof (FLT),cudaMemcpyHostToDevice));
262
- if (dim > 2 )
263
- checkCudaErrors (cudaMemcpy (d_plan->fwkerhalf3 ,fwkerhalf3,(nf3/2 +1 )*
264
- sizeof (FLT),cudaMemcpyHostToDevice));
265
- #ifdef TIME
266
- cudaEventRecord (stop);
267
- cudaEventSynchronize (stop);
268
- cudaEventElapsedTime (&milliseconds, start, stop);
269
- printf (" [time ] \t Copy fwkerhalf HtoD\t %.3g s\n " , milliseconds/1000 );
270
- #endif
271
- free (fwkerhalf1);
272
- if (dim > 1 )
273
- free (fwkerhalf2);
274
- if (dim > 2 )
275
- free (fwkerhalf3);
276
- } else {
277
- timer.start ();
278
- complex<double > a[3 *MAX_NQUAD];
279
- FLT f[3 *MAX_NQUAD];
280
- onedim_fseries_kernel_1sthalf (nf1, f, a, d_plan->spopts );
281
- if (dim > 1 ){
282
- onedim_fseries_kernel_1sthalf (nf2, f+MAX_NQUAD, a+MAX_NQUAD, d_plan->spopts );
283
- }
284
- if (dim > 2 ){
285
- onedim_fseries_kernel_1sthalf (nf3, f+2 *MAX_NQUAD, a+2 *MAX_NQUAD, d_plan->spopts );
286
- }
238
+ CNTime timer; timer.start ();
239
+ complex<double > a[3 *MAX_NQUAD];
240
+ FLT f[3 *MAX_NQUAD];
241
+ onedim_fseries_kernel_precomp (nf1, f, a, d_plan->spopts );
242
+ if (dim > 1 ){
243
+ onedim_fseries_kernel_precomp (nf2, f+MAX_NQUAD, a+MAX_NQUAD, d_plan->spopts );
244
+ }
245
+ if (dim > 2 ){
246
+ onedim_fseries_kernel_precomp (nf3, f+2 *MAX_NQUAD, a+2 *MAX_NQUAD, d_plan->spopts );
247
+ }
287
248
#ifdef TIME
288
- printf (" [time ] \t kernel fser (1st half on CPU):\t %.3g s\n " , timer.elapsedsec ());
249
+ printf (" [time ] \t kernel fser (1st half on CPU):\t %.3g s\n " , timer.elapsedsec ());
289
250
#endif
290
251
291
- cudaEventRecord (start);
292
- cuDoubleComplex *d_a;
293
- FLT *d_f;
294
- checkCudaErrors (cudaMalloc (&d_a, dim*MAX_NQUAD*sizeof (cuDoubleComplex)));
295
- checkCudaErrors (cudaMalloc (&d_f, dim*MAX_NQUAD*sizeof (FLT)));
296
- checkCudaErrors (cudaMemcpy (d_a,a,dim*MAX_NQUAD*sizeof (cuDoubleComplex),cudaMemcpyHostToDevice));
297
- checkCudaErrors (cudaMemcpy (d_f,f,dim*MAX_NQUAD*sizeof (FLT),cudaMemcpyHostToDevice));
298
- ier = CUONEDIMFSERIESKERNEL (d_plan->dim , nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1 ,
299
- d_plan->fwkerhalf2 , d_plan->fwkerhalf3 , d_plan->spopts .nspread );
252
+ cudaEventRecord (start);
253
+ cuDoubleComplex *d_a;
254
+ FLT *d_f;
255
+ checkCudaErrors (cudaMalloc (&d_a, dim*MAX_NQUAD*sizeof (cuDoubleComplex)));
256
+ checkCudaErrors (cudaMalloc (&d_f, dim*MAX_NQUAD*sizeof (FLT)));
257
+ checkCudaErrors (cudaMemcpy (d_a,a,dim*MAX_NQUAD*sizeof (cuDoubleComplex),cudaMemcpyHostToDevice));
258
+ checkCudaErrors (cudaMemcpy (d_f,f,dim*MAX_NQUAD*sizeof (FLT),cudaMemcpyHostToDevice));
259
+ ier = CUFSERIESKERNELCOMPUTE (d_plan->dim , nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1 ,
260
+ d_plan->fwkerhalf2 , d_plan->fwkerhalf3 , d_plan->spopts .nspread );
300
261
#ifdef TIME
301
- cudaEventRecord (stop);
302
- cudaEventSynchronize (stop);
303
- cudaEventElapsedTime (&milliseconds, start, stop);
304
- printf (" [time ] \t kernel fser (2nd half on GPU)\t %.3g s\n " , milliseconds/1000 );
262
+ cudaEventRecord (stop);
263
+ cudaEventSynchronize (stop);
264
+ cudaEventElapsedTime (&milliseconds, start, stop);
265
+ printf (" [time ] \t kernel fser (2nd half on GPU)\t %.3g s\n " , milliseconds/1000 );
305
266
#endif
306
- cudaFree (d_a);
307
- cudaFree (d_f);
308
- }
267
+ cudaFree (d_a);
268
+ cudaFree (d_f);
309
269
// Multi-GPU support: reset the device ID
310
270
cudaSetDevice (orig_gpu_device_id);
311
271
0 commit comments