Skip to content

Commit 3d2b390

Browse files
committed
func. rename; clean up makefile&some unused codes; always using GPU version of fseriesker compute
1 parent d6eaa16 commit 3d2b390

10 files changed

+87
-140
lines changed

Makefile

+4-2
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,9 @@ CUFINUFFTOBJS_32=$(CUFINUFFTOBJS_64:%.o=%_32.o)
116116
default: all
117117

118118
# Build all, but run no tests. Note: CI currently uses this default...
119-
all: libtest spreadtest examples fserieskertest
119+
all: libtest internaltest examples
120+
121+
internaltest: spreadtest fserieskertest
120122

121123
# testers for the lib (does not execute)
122124
libtest: lib $(BINDIR)/cufinufft2d1_test \
@@ -138,7 +140,7 @@ libtest: lib $(BINDIR)/cufinufft2d1_test \
138140
$(BINDIR)/cufinufft1d1_test \
139141
$(BINDIR)/cufinufft1d2_test \
140142
$(BINDIR)/cufinufft1d1_test_32 \
141-
$(BINDIR)/cufinufft1d2_test_32 \
143+
$(BINDIR)/cufinufft1d2_test_32
142144

143145
# low-level (not-library) testers (does not execute)
144146
spreadtest: $(BINDIR)/spread2d_test \

contrib/common.cpp

+19-36
Original file line numberDiff line numberDiff line change
@@ -59,44 +59,29 @@ void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, SPREAD_OPTS opts)
5959
sampled kernel, not quite the same object.
6060
6161
Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18
62+
Melody 2/20/22 separate into precomp & comp functions defined below.
6263
*/
6364
{
64-
FLT J2 = opts.nspread/2.0; // J/2, half-width of ker z-support
65-
// # quadr nodes in z (from 0 to J/2; reflections will be added)...
66-
int q=(int)(2 + 3.0*J2); // not sure why so large? cannot exceed MAX_NQUAD
67-
FLT f[MAX_NQUAD]; double z[2*MAX_NQUAD],w[2*MAX_NQUAD];
68-
legendre_compute_glr(2*q,z,w); // only half the nodes used, eg on (0,1)
65+
FLT f[MAX_NQUAD];
6966
dcomplex a[MAX_NQUAD];
70-
for (int n=0;n<q;++n) { // set up nodes z_n and vals f_n
71-
z[n] *= J2; // rescale nodes
72-
f[n] = J2*(FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // vals & quadr wei
73-
a[n] = exp(2*PI*IMA*(FLT)(nf/2-z[n])/(FLT)nf); // phase winding rates
74-
}
75-
BIGINT nout=nf/2+1; // how many values we're writing to
76-
int nt = MIN(nout,MY_OMP_GET_MAX_THREADS()); // how many chunks
77-
std::vector<BIGINT> brk(nt+1); // start indices for each thread
78-
for (int t=0; t<=nt; ++t) // split nout mode indices btw threads
79-
brk[t] = (BIGINT)(0.5 + nout*t/(double)nt);
80-
#pragma omp parallel
81-
{
82-
int t = MY_OMP_GET_THREAD_NUM();
83-
if (t<nt) { // could be nt < actual # threads
84-
dcomplex aj[MAX_NQUAD]; // phase rotator for this thread
85-
for (int n=0;n<q;++n)
86-
aj[n] = pow(a[n],(FLT)brk[t]); // init phase factors for chunk
87-
for (BIGINT j=brk[t];j<brk[t+1];++j) { // loop along output array
88-
FLT x = 0.0; // accumulator for answer at this j
89-
for (int n=0;n<q;++n) {
90-
x += f[n] * 2*real(aj[n]); // include the negative freq
91-
aj[n] *= a[n]; // wind the phases
92-
}
93-
fwkerhalf[j] = x;
94-
}
95-
}
96-
}
67+
onedim_fseries_kernel_precomp(nf, f, a, opts);
68+
onedim_fseries_kernel_compute(nf, f, a, fwkerhalf, opts);
9769
}
9870

99-
void onedim_fseries_kernel_1sthalf(BIGINT nf, FLT *f, dcomplex *a, SPREAD_OPTS opts)
71+
/*
72+
Precomputation of approximations of exact Fourier series coeffs of cnufftspread's
73+
real symmetric kernel.
74+
75+
Inputs:
76+
nf - size of 1d uniform spread grid, must be even.
77+
opts - spreading opts object, needed to eval kernel (must be already set up)
78+
79+
Outputs:
80+
a - phase winding rates
81+
f - funciton values at quadrature nodes multiplied with quadrature weights
82+
(a, f are provided as the inputs of onedim_fseries_kernel_compute() defined below)
83+
*/
84+
void onedim_fseries_kernel_precomp(BIGINT nf, FLT *f, dcomplex *a, SPREAD_OPTS opts)
10085
{
10186
FLT J2 = opts.nspread/2.0; // J/2, half-width of ker z-support
10287
// # quadr nodes in z (from 0 to J/2; reflections will be added)...
@@ -110,8 +95,7 @@ void onedim_fseries_kernel_1sthalf(BIGINT nf, FLT *f, dcomplex *a, SPREAD_OPTS o
11095
}
11196
}
11297

113-
#if 0
114-
void onedim_fseries_kernel_2ndhalf(BIGINT nf, FLT *f, dcomplex *a, FLT *fwkerhalf, SPREAD_OPTS opts)
98+
void onedim_fseries_kernel_compute(BIGINT nf, FLT *f, dcomplex *a, FLT *fwkerhalf, SPREAD_OPTS opts)
11599
{
116100
FLT J2 = opts.nspread/2.0; // J/2, half-width of ker z-support
117101
int q=(int)(2 + 3.0*J2); // not sure why so large? cannot exceed MAX_NQUAD
@@ -138,4 +122,3 @@ void onedim_fseries_kernel_2ndhalf(BIGINT nf, FLT *f, dcomplex *a, FLT *fwkerhal
138122
}
139123
}
140124
}
141-
#endif

contrib/common.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,6 @@ int setup_spreader_for_nufft(SPREAD_OPTS &spopts, FLT eps, cufinufft_opts opts);
2020
void SET_NF_TYPE12(BIGINT ms, cufinufft_opts opts, SPREAD_OPTS spopts,BIGINT *nf,
2121
BIGINT b);
2222
void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, SPREAD_OPTS opts);
23-
void onedim_fseries_kernel_1sthalf(BIGINT nf, FLT *f, dcomplex *a, SPREAD_OPTS opts);
24-
//void onedim_fseries_kernel_2ndhalf(BIGINT nf, FLT *f, dcomplex *a, FLT *fwkerhalf, SPREAD_OPTS opts);
23+
void onedim_fseries_kernel_precomp(BIGINT nf, FLT *f, dcomplex *a, SPREAD_OPTS opts);
24+
void onedim_fseries_kernel_compute(BIGINT nf, FLT *f, dcomplex *a, FLT *fwkerhalf, SPREAD_OPTS opts);
2525
#endif // COMMON_H

include/cufinufft_eitherprec.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@
9696
#undef CUFINUFFT_PLAN_S
9797
#undef CUFINUFFT_PLAN
9898
/* fseries kernel */
99-
#undef CUONEDIMFSERIESKERNEL
99+
#undef CUFSERIESKERNELCOMPUTE
100100

101101
#ifdef SINGLE
102102

@@ -168,7 +168,7 @@
168168
#define CUFINUFFT_PLAN_S cufinufftf_plan_s
169169
#define CUFINUFFT_PLAN cufinufftf_plan
170170
/* fseries kernel */
171-
#define CUONEDIMFSERIESKERNEL cuonedimfserieskernel_f
171+
#define CUFSERIESKERNELCOMPUTE cufserieskernelcompute_f
172172

173173
#else
174174

@@ -240,7 +240,7 @@
240240
#define CUFINUFFT_PLAN_S cufinufft_plan_s
241241
#define CUFINUFFT_PLAN cufinufft_plan
242242
/* fseries kernel */
243-
#define CUONEDIMFSERIESKERNEL cuonedimfserieskernel
243+
#define CUFSERIESKERNELCOMPUTE cufserieskernelcompute
244244

245245
#endif
246246

src/common.cu

+17-10
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,20 @@
99

1010
using namespace std;
1111

12-
/* TODO */
12+
/* Kernel for computing approximations of exact Fourier series coeffs of
13+
cnufftspread's real symmetric kernel. */
14+
// a , f are intermediate results from function onedim_fseries_kernel_precomp()
15+
// (see cufinufft/contrib/common.cpp for description)
1316
__global__
14-
void OnedimFseriesKernel(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a, FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3, int ns)
17+
void FseriesKernelCompute(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a,
18+
FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3, int ns)
1519
{
1620
FLT J2 = ns/2.0;
1721
int q=(int)(2 + 3.0*J2);
1822
int nf;
19-
//cuDoubleComplex aj[MAX_NQUAD];
2023
cuDoubleComplex *at = a + threadIdx.y*MAX_NQUAD;
2124
FLT *ft = f + threadIdx.y*MAX_NQUAD;
22-
FLT *oarr;
25+
FLT *oarr;
2326
if (threadIdx.y == 0){
2427
oarr = fwkerhalf1;
2528
nf = nf1;
@@ -41,18 +44,22 @@ void OnedimFseriesKernel(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a,
4144
}
4245
}
4346

44-
int CUONEDIMFSERIESKERNEL(int dim, int nf1, int nf2, int nf3, FLT *d_f, cuDoubleComplex *d_a,
45-
FLT *d_fwkerhalf1, FLT *d_fwkerhalf2, FLT *d_fwkerhalf3, int ns)
46-
/*
47-
TODO
47+
int CUFSERIESKERNELCOMPUTE(int dim, int nf1, int nf2, int nf3, FLT *d_f,
48+
cuDoubleComplex *d_a, FLT *d_fwkerhalf1, FLT *d_fwkerhalf2,
49+
FLT *d_fwkerhalf3, int ns)
50+
/*
51+
wrapper for approximation of Fourier series of real symmetric spreading
52+
kernel.
53+
54+
Melody Shih 2/20/22
4855
*/
4956
{
5057
int nout = max(max(nf1/2+1,nf2/2+1),nf3/2+1);
5158

5259
dim3 threadsPerBlock(16, dim);
5360
dim3 numBlocks((nout+16-1)/16, 1);
5461

55-
OnedimFseriesKernel<<<numBlocks, threadsPerBlock>>>(nf1, nf2, nf3, d_f, d_a,
56-
d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns);
62+
FseriesKernelCompute<<<numBlocks, threadsPerBlock>>>(nf1, nf2, nf3, d_f,
63+
d_a, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns);
5764
return 0;
5865
}

src/common.h

+2-5
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,8 @@
22
#define __COMMON_H__
33
#include <cufinufft_eitherprec.h>
44

5-
//__global__
6-
//void OnedimFseriesKernel(int nf, FLT *f, cuDoubleComplex* a, FLT *fwkerhalf, int ns);
75
__global__
8-
void OnedimFseriesKernel(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a, FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3, int ns);
6+
void FseriesKernelCompute(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a, FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3, int ns);
97

10-
//int CUONEDIMFSERIESKERNEL(int nf, FLT *d_f, cuDoubleComplex* d_a, FLT *d_fwkerhalf, int ns);
11-
int CUONEDIMFSERIESKERNEL(int dim, int nf1, int nf2, int nf3, FLT *d_f, cuDoubleComplex *d_a, FLT *d_fwkerhalf1, FLT *d_fwkerhalf2, FLT *d_fwkerhalf3, int ns);
8+
int CUFSERIESKERNELCOMPUTE(int dim, int nf1, int nf2, int nf3, FLT *d_f, cuDoubleComplex *d_a, FLT *d_fwkerhalf1, FLT *d_fwkerhalf2, FLT *d_fwkerhalf3, int ns);
129
#endif

src/cufinufft.cu

+26-66
Original file line numberDiff line numberDiff line change
@@ -235,77 +235,37 @@ This performs:
235235
cudaEventElapsedTime(&milliseconds, start, stop);
236236
printf("[time ] \tCUFFT Plan\t\t %.3g s\n", milliseconds/1000);
237237
#endif
238-
CNTime timer;
239-
if (max(nf1, max(nf2, nf3)) < 2e3) {
240-
timer.start();
241-
FLT *fwkerhalf1, *fwkerhalf2, *fwkerhalf3;
242-
243-
fwkerhalf1 = (FLT*)malloc(sizeof(FLT)*(nf1/2+1));
244-
onedim_fseries_kernel(nf1, fwkerhalf1, d_plan->spopts);
245-
if(dim > 1){
246-
fwkerhalf2 = (FLT*)malloc(sizeof(FLT)*(nf2/2+1));
247-
onedim_fseries_kernel(nf2, fwkerhalf2, d_plan->spopts);
248-
}
249-
if(dim > 2){
250-
fwkerhalf3 = (FLT*)malloc(sizeof(FLT)*(nf3/2+1));
251-
onedim_fseries_kernel(nf3, fwkerhalf3, d_plan->spopts);
252-
}
253-
#ifdef TIME
254-
printf("[time ] \tkernel fser (on CPU):\t %.3g s\n", timer.elapsedsec());
255-
#endif
256-
cudaEventRecord(start);
257-
checkCudaErrors(cudaMemcpy(d_plan->fwkerhalf1,fwkerhalf1,(nf1/2+1)*
258-
sizeof(FLT),cudaMemcpyHostToDevice));
259-
if(dim > 1)
260-
checkCudaErrors(cudaMemcpy(d_plan->fwkerhalf2,fwkerhalf2,(nf2/2+1)*
261-
sizeof(FLT),cudaMemcpyHostToDevice));
262-
if(dim > 2)
263-
checkCudaErrors(cudaMemcpy(d_plan->fwkerhalf3,fwkerhalf3,(nf3/2+1)*
264-
sizeof(FLT),cudaMemcpyHostToDevice));
265-
#ifdef TIME
266-
cudaEventRecord(stop);
267-
cudaEventSynchronize(stop);
268-
cudaEventElapsedTime(&milliseconds, start, stop);
269-
printf("[time ] \tCopy fwkerhalf HtoD\t %.3g s\n", milliseconds/1000);
270-
#endif
271-
free(fwkerhalf1);
272-
if(dim > 1)
273-
free(fwkerhalf2);
274-
if(dim > 2)
275-
free(fwkerhalf3);
276-
} else {
277-
timer.start();
278-
complex<double> a[3*MAX_NQUAD];
279-
FLT f[3*MAX_NQUAD];
280-
onedim_fseries_kernel_1sthalf(nf1, f, a, d_plan->spopts);
281-
if(dim > 1){
282-
onedim_fseries_kernel_1sthalf(nf2, f+MAX_NQUAD, a+MAX_NQUAD, d_plan->spopts);
283-
}
284-
if(dim > 2){
285-
onedim_fseries_kernel_1sthalf(nf3, f+2*MAX_NQUAD, a+2*MAX_NQUAD, d_plan->spopts);
286-
}
238+
CNTime timer; timer.start();
239+
complex<double> a[3*MAX_NQUAD];
240+
FLT f[3*MAX_NQUAD];
241+
onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts);
242+
if(dim > 1){
243+
onedim_fseries_kernel_precomp(nf2, f+MAX_NQUAD, a+MAX_NQUAD, d_plan->spopts);
244+
}
245+
if(dim > 2){
246+
onedim_fseries_kernel_precomp(nf3, f+2*MAX_NQUAD, a+2*MAX_NQUAD, d_plan->spopts);
247+
}
287248
#ifdef TIME
288-
printf("[time ] \tkernel fser (1st half on CPU):\t %.3g s\n", timer.elapsedsec());
249+
printf("[time ] \tkernel fser (1st half on CPU):\t %.3g s\n", timer.elapsedsec());
289250
#endif
290251

291-
cudaEventRecord(start);
292-
cuDoubleComplex *d_a;
293-
FLT *d_f;
294-
checkCudaErrors(cudaMalloc(&d_a, dim*MAX_NQUAD*sizeof(cuDoubleComplex)));
295-
checkCudaErrors(cudaMalloc(&d_f, dim*MAX_NQUAD*sizeof(FLT)));
296-
checkCudaErrors(cudaMemcpy(d_a,a,dim*MAX_NQUAD*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
297-
checkCudaErrors(cudaMemcpy(d_f,f,dim*MAX_NQUAD*sizeof(FLT),cudaMemcpyHostToDevice));
298-
ier = CUONEDIMFSERIESKERNEL(d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1,
299-
d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->spopts.nspread);
252+
cudaEventRecord(start);
253+
cuDoubleComplex *d_a;
254+
FLT *d_f;
255+
checkCudaErrors(cudaMalloc(&d_a, dim*MAX_NQUAD*sizeof(cuDoubleComplex)));
256+
checkCudaErrors(cudaMalloc(&d_f, dim*MAX_NQUAD*sizeof(FLT)));
257+
checkCudaErrors(cudaMemcpy(d_a,a,dim*MAX_NQUAD*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
258+
checkCudaErrors(cudaMemcpy(d_f,f,dim*MAX_NQUAD*sizeof(FLT),cudaMemcpyHostToDevice));
259+
ier = CUFSERIESKERNELCOMPUTE(d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1,
260+
d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->spopts.nspread);
300261
#ifdef TIME
301-
cudaEventRecord(stop);
302-
cudaEventSynchronize(stop);
303-
cudaEventElapsedTime(&milliseconds, start, stop);
304-
printf("[time ] \tkernel fser (2nd half on GPU)\t %.3g s\n", milliseconds/1000);
262+
cudaEventRecord(stop);
263+
cudaEventSynchronize(stop);
264+
cudaEventElapsedTime(&milliseconds, start, stop);
265+
printf("[time ] \tkernel fser (2nd half on GPU)\t %.3g s\n", milliseconds/1000);
305266
#endif
306-
cudaFree(d_a);
307-
cudaFree(d_f);
308-
}
267+
cudaFree(d_a);
268+
cudaFree(d_f);
309269
// Multi-GPU support: reset the device ID
310270
cudaSetDevice(orig_gpu_device_id);
311271

src/precision_independent.cu

-4
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,6 @@
1212
/* Auxiliary func to compute power of complex number */
1313
__device__ RT carg(const CT& z) {return (RT)atan2(ipart(z), rpart(z));} // polar angle
1414
__device__ RT cabs(const CT& z) {return (RT)cuCabs(z);}
15-
__device__ CT cpow(const CT& z, const int &n) {
16-
RT abs_z_n = pow(cabs(z), n);
17-
return cmplx(abs_z_n*cos(n*carg(z)), abs_z_n*sin(n*carg(z)));
18-
}
1915

2016
/* Common Kernels from spreadinterp3d */
2117
__host__ __device__

test/fseries_kernel_test.cu

+12-10
Original file line numberDiff line numberDiff line change
@@ -106,11 +106,11 @@ int main(int argc, char* argv[])
106106
timer.start();
107107
complex<double> a[dim*MAX_NQUAD];
108108
FLT f[dim*MAX_NQUAD];
109-
onedim_fseries_kernel_1sthalf(nf1, f, a, opts);
109+
onedim_fseries_kernel_precomp(nf1, f, a, opts);
110110
if(dim > 1)
111-
onedim_fseries_kernel_1sthalf(nf2, f+MAX_NQUAD, a+MAX_NQUAD, opts);
111+
onedim_fseries_kernel_precomp(nf2, f+MAX_NQUAD, a+MAX_NQUAD, opts);
112112
if(dim > 2)
113-
onedim_fseries_kernel_1sthalf(nf3, f+2*MAX_NQUAD, a+2*MAX_NQUAD, opts);
113+
onedim_fseries_kernel_precomp(nf3, f+2*MAX_NQUAD, a+2*MAX_NQUAD, opts);
114114
cputime = timer.elapsedsec();
115115

116116
cuDoubleComplex *d_a;
@@ -123,7 +123,7 @@ int main(int argc, char* argv[])
123123
dim*MAX_NQUAD*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
124124
checkCudaErrors(cudaMemcpy(d_f,f,
125125
dim*MAX_NQUAD*sizeof(FLT),cudaMemcpyHostToDevice));
126-
ier = CUONEDIMFSERIESKERNEL(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1,
126+
ier = CUFSERIESKERNELCOMPUTE(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1,
127127
d_fwkerhalf2, d_fwkerhalf3, opts.nspread);
128128
}
129129
cudaEventRecord(stop);
@@ -151,12 +151,14 @@ int main(int argc, char* argv[])
151151
for(int i=0; i<nf1/2+1; i++)
152152
printf("%10.8e ", fwkerhalf1[i]);
153153
printf("\n");
154-
for(int i=0; i<nf2/2+1; i++)
155-
printf("%10.8e ", fwkerhalf2[i]);
156-
printf("\n");
157-
for(int i=0; i<nf3/2+1; i++)
158-
printf("%10.8e ", fwkerhalf3[i]);
159-
printf("\n");
154+
if(dim > 1)
155+
for(int i=0; i<nf2/2+1; i++)
156+
printf("%10.8e ", fwkerhalf2[i]);
157+
printf("\n");
158+
if(dim > 2)
159+
for(int i=0; i<nf3/2+1; i++)
160+
printf("%10.8e ", fwkerhalf3[i]);
161+
printf("\n");
160162
#endif
161163

162164
return 0;

test/fseriesperf.sh

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
2-
# basic perf test of spread/interp for 2/3d, single/double
3-
# Barnett 1/29/21, some 1D added 12/2/21.
2+
# basic perf test of compute fseries for 1d, single/double
3+
# Melody 02/20/22
44

55
BIN=../bin/fseries_kernel_test
66
DIM=1

0 commit comments

Comments
 (0)