Rename functions; clean up Makefile and some unused code; always use the GPU version of the fseries kernel computation

MelodyShih · MelodyShih · commit 3d2b3904be6f · 2022-02-20T12:08:54.000-05:00
diff --git a/Makefile b/Makefile
@@ -116,7 +116,9 @@ CUFINUFFTOBJS_32=$(CUFINUFFTOBJS_64:%.o=%_32.o)
 default: all
 
 # Build all, but run no tests. Note: CI currently uses this default...
-all: libtest spreadtest examples fserieskertest
+all: libtest internaltest examples
+
+internaltest: spreadtest fserieskertest
 
 # testers for the lib (does not execute)
 libtest: lib $(BINDIR)/cufinufft2d1_test \
@@ -138,7 +140,7 @@ libtest: lib $(BINDIR)/cufinufft2d1_test \
 	$(BINDIR)/cufinufft1d1_test \
 	$(BINDIR)/cufinufft1d2_test \
 	$(BINDIR)/cufinufft1d1_test_32 \
-	$(BINDIR)/cufinufft1d2_test_32 \
+	$(BINDIR)/cufinufft1d2_test_32
 
 # low-level (not-library) testers (does not execute)
 spreadtest: $(BINDIR)/spread2d_test \
diff --git a/contrib/common.cpp b/contrib/common.cpp
@@ -59,44 +59,29 @@ void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, SPREAD_OPTS opts)
   sampled kernel, not quite the same object.
 
   Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18
+  Melody 2/20/22 separate into precomp & comp functions defined below.
  */
 {
-  FLT J2 = opts.nspread/2.0;            // J/2, half-width of ker z-support
-  // # quadr nodes in z (from 0 to J/2; reflections will be added)...
-  int q=(int)(2 + 3.0*J2);  // not sure why so large? cannot exceed MAX_NQUAD
-  FLT f[MAX_NQUAD]; double z[2*MAX_NQUAD],w[2*MAX_NQUAD];
-  legendre_compute_glr(2*q,z,w);        // only half the nodes used, eg on (0,1)
+  FLT f[MAX_NQUAD];
   dcomplex a[MAX_NQUAD];
-  for (int n=0;n<q;++n) {               // set up nodes z_n and vals f_n
-    z[n] *= J2;                         // rescale nodes
-    f[n] = J2*(FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // vals & quadr wei
-    a[n] = exp(2*PI*IMA*(FLT)(nf/2-z[n])/(FLT)nf);  // phase winding rates
-  }
-  BIGINT nout=nf/2+1;                   // how many values we're writing to
-  int nt = MIN(nout,MY_OMP_GET_MAX_THREADS());  // how many chunks
-  std::vector<BIGINT> brk(nt+1);        // start indices for each thread
-  for (int t=0; t<=nt; ++t)             // split nout mode indices btw threads
-    brk[t] = (BIGINT)(0.5 + nout*t/(double)nt);
-#pragma omp parallel
-  {
-    int t = MY_OMP_GET_THREAD_NUM();
-    if (t<nt) {                         // could be nt < actual # threads
-      dcomplex aj[MAX_NQUAD];           // phase rotator for this thread
-      for (int n=0;n<q;++n)
-	aj[n] = pow(a[n],(FLT)brk[t]);       // init phase factors for chunk
-      for (BIGINT j=brk[t];j<brk[t+1];++j) {       // loop along output array
-	FLT x = 0.0;                       // accumulator for answer at this j
-	for (int n=0;n<q;++n) {
-	  x += f[n] * 2*real(aj[n]);       // include the negative freq
-	  aj[n] *= a[n];                   // wind the phases
-	}
-	fwkerhalf[j] = x;
-      }
-    }
-  }
+  onedim_fseries_kernel_precomp(nf, f, a, opts);
+  onedim_fseries_kernel_compute(nf, f, a, fwkerhalf, opts);
 }
 
-void onedim_fseries_kernel_1sthalf(BIGINT nf, FLT *f, dcomplex *a, SPREAD_OPTS opts)
+/*
+  Precomputation of approximations of exact Fourier series coeffs of cnufftspread's
+  real symmetric kernel.
+
+  Inputs:
+  nf - size of 1d uniform spread grid, must be even.
+  opts - spreading opts object, needed to eval kernel (must be already set up)
+
+  Outputs:
+  a - phase winding rates
+  f - function values at quadrature nodes multiplied by quadrature weights
+  (a, f are provided as the inputs of onedim_fseries_kernel_compute() defined below)
+*/
+void onedim_fseries_kernel_precomp(BIGINT nf, FLT *f, dcomplex *a, SPREAD_OPTS opts)
 {
   FLT J2 = opts.nspread/2.0;            // J/2, half-width of ker z-support
   // # quadr nodes in z (from 0 to J/2; reflections will be added)...
@@ -110,8 +95,7 @@ void onedim_fseries_kernel_1sthalf(BIGINT nf, FLT *f, dcomplex *a, SPREAD_OPTS o
   }
 }
 
-#if 0
-void onedim_fseries_kernel_2ndhalf(BIGINT nf, FLT *f, dcomplex *a, FLT *fwkerhalf, SPREAD_OPTS opts)
+void onedim_fseries_kernel_compute(BIGINT nf, FLT *f, dcomplex *a, FLT *fwkerhalf, SPREAD_OPTS opts)
 {
   FLT J2 = opts.nspread/2.0;            // J/2, half-width of ker z-support
   int q=(int)(2 + 3.0*J2);  // not sure why so large? cannot exceed MAX_NQUAD
@@ -138,4 +122,3 @@ void onedim_fseries_kernel_2ndhalf(BIGINT nf, FLT *f, dcomplex *a, FLT *fwkerhal
     }
   }
 }
-#endif
diff --git a/contrib/common.h b/contrib/common.h
@@ -20,6 +20,6 @@ int setup_spreader_for_nufft(SPREAD_OPTS &spopts, FLT eps, cufinufft_opts opts);
 void SET_NF_TYPE12(BIGINT ms, cufinufft_opts opts, SPREAD_OPTS spopts,BIGINT *nf,
                    BIGINT b);
 void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, SPREAD_OPTS opts);
-void onedim_fseries_kernel_1sthalf(BIGINT nf, FLT *f, dcomplex *a, SPREAD_OPTS opts);
-//void onedim_fseries_kernel_2ndhalf(BIGINT nf, FLT *f, dcomplex *a, FLT *fwkerhalf, SPREAD_OPTS opts);
+void onedim_fseries_kernel_precomp(BIGINT nf, FLT *f, dcomplex *a, SPREAD_OPTS opts);
+void onedim_fseries_kernel_compute(BIGINT nf, FLT *f, dcomplex *a, FLT *fwkerhalf, SPREAD_OPTS opts);
 #endif  // COMMON_H
diff --git a/include/cufinufft_eitherprec.h b/include/cufinufft_eitherprec.h
@@ -96,7 +96,7 @@
 #undef CUFINUFFT_PLAN_S
 #undef CUFINUFFT_PLAN
 /* fseries kernel */
-#undef CUONEDIMFSERIESKERNEL
+#undef CUFSERIESKERNELCOMPUTE
 
 #ifdef SINGLE
 
@@ -168,7 +168,7 @@
 #define CUFINUFFT_PLAN_S cufinufftf_plan_s
 #define CUFINUFFT_PLAN cufinufftf_plan
 /* fseries kernel */
-#define CUONEDIMFSERIESKERNEL cuonedimfserieskernel_f
+#define CUFSERIESKERNELCOMPUTE cufserieskernelcompute_f
 
 #else
 
@@ -240,7 +240,7 @@
 #define CUFINUFFT_PLAN_S cufinufft_plan_s
 #define CUFINUFFT_PLAN cufinufft_plan
 /* fseries kernel */
-#define CUONEDIMFSERIESKERNEL cuonedimfserieskernel
+#define CUFSERIESKERNELCOMPUTE cufserieskernelcompute
 
 #endif
 
diff --git a/src/common.cu b/src/common.cu
@@ -9,17 +9,20 @@
 
 using namespace std;
 
-/* TODO */
+/* Kernel for computing approximations of exact Fourier series coeffs of
+   cnufftspread's real symmetric kernel. */
+// a, f are intermediate results from function onedim_fseries_kernel_precomp()
+// (see cufinufft/contrib/common.cpp for description)
 __global__
-void OnedimFseriesKernel(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a, FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3, int ns)
+void FseriesKernelCompute(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a,
+	FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3, int ns)
 {
 	FLT J2 = ns/2.0;
 	int q=(int)(2 + 3.0*J2);
 	int nf;
-	//cuDoubleComplex aj[MAX_NQUAD];
 	cuDoubleComplex *at = a + threadIdx.y*MAX_NQUAD;
 	FLT *ft = f + threadIdx.y*MAX_NQUAD;
-	FLT *oarr; 
+	FLT *oarr;
 	if (threadIdx.y == 0){
 		oarr = fwkerhalf1;
 		nf = nf1;
@@ -41,18 +44,22 @@ void OnedimFseriesKernel(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a,
 	}
 }
 
-int CUONEDIMFSERIESKERNEL(int dim, int nf1, int nf2, int nf3, FLT *d_f, cuDoubleComplex *d_a, 
-		FLT *d_fwkerhalf1, FLT *d_fwkerhalf2, FLT *d_fwkerhalf3, int ns)
-/* 
-	TODO
+int CUFSERIESKERNELCOMPUTE(int dim, int nf1, int nf2, int nf3, FLT *d_f,
+	cuDoubleComplex *d_a, FLT *d_fwkerhalf1, FLT *d_fwkerhalf2,
+	FLT *d_fwkerhalf3, int ns)
+/*
+	wrapper for approximation of Fourier series of real symmetric spreading 
+	kernel.
+
+	Melody Shih 2/20/22
 */
 {
 	int nout = max(max(nf1/2+1,nf2/2+1),nf3/2+1);
 
 	dim3 threadsPerBlock(16, dim);
 	dim3 numBlocks((nout+16-1)/16, 1);
 
-	OnedimFseriesKernel<<<numBlocks, threadsPerBlock>>>(nf1, nf2, nf3, d_f, d_a, 
-			d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns);
+	FseriesKernelCompute<<<numBlocks, threadsPerBlock>>>(nf1, nf2, nf3, d_f,
+		d_a, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns);
 	return 0;
 }
diff --git a/src/common.h b/src/common.h
@@ -2,11 +2,8 @@
 #define __COMMON_H__ 
 #include <cufinufft_eitherprec.h>
 
-//__global__
-//void OnedimFseriesKernel(int nf, FLT *f, cuDoubleComplex* a, FLT *fwkerhalf, int ns);
 __global__
-void OnedimFseriesKernel(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a, FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3, int ns);
+void FseriesKernelCompute(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a, FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3, int ns);
 
-//int CUONEDIMFSERIESKERNEL(int nf, FLT *d_f, cuDoubleComplex* d_a, FLT *d_fwkerhalf, int ns);
-int CUONEDIMFSERIESKERNEL(int dim, int nf1, int nf2, int nf3, FLT *d_f, cuDoubleComplex *d_a, FLT *d_fwkerhalf1, FLT *d_fwkerhalf2, FLT *d_fwkerhalf3, int ns);
+int CUFSERIESKERNELCOMPUTE(int dim, int nf1, int nf2, int nf3, FLT *d_f, cuDoubleComplex *d_a, FLT *d_fwkerhalf1, FLT *d_fwkerhalf2, FLT *d_fwkerhalf3, int ns);
 #endif
diff --git a/src/cufinufft.cu b/src/cufinufft.cu
@@ -235,77 +235,37 @@ This performs:
 	cudaEventElapsedTime(&milliseconds, start, stop);
 	printf("[time  ] \tCUFFT Plan\t\t %.3g s\n", milliseconds/1000);
 #endif
-	CNTime timer; 
-	if (max(nf1, max(nf2, nf3)) < 2e3) {
-		timer.start();
-		FLT *fwkerhalf1, *fwkerhalf2, *fwkerhalf3;
-
-		fwkerhalf1 = (FLT*)malloc(sizeof(FLT)*(nf1/2+1));
-		onedim_fseries_kernel(nf1, fwkerhalf1, d_plan->spopts);
-		if(dim > 1){
-			fwkerhalf2 = (FLT*)malloc(sizeof(FLT)*(nf2/2+1));
-			onedim_fseries_kernel(nf2, fwkerhalf2, d_plan->spopts);
-		}
-		if(dim > 2){
-			fwkerhalf3 = (FLT*)malloc(sizeof(FLT)*(nf3/2+1));
-			onedim_fseries_kernel(nf3, fwkerhalf3, d_plan->spopts);
-		}
-#ifdef TIME
-		printf("[time  ] \tkernel fser (on CPU):\t %.3g s\n", timer.elapsedsec());
-#endif
-		cudaEventRecord(start);
-		checkCudaErrors(cudaMemcpy(d_plan->fwkerhalf1,fwkerhalf1,(nf1/2+1)*
-			sizeof(FLT),cudaMemcpyHostToDevice));
-		if(dim > 1)
-			checkCudaErrors(cudaMemcpy(d_plan->fwkerhalf2,fwkerhalf2,(nf2/2+1)*
-				sizeof(FLT),cudaMemcpyHostToDevice));
-		if(dim > 2)
-			checkCudaErrors(cudaMemcpy(d_plan->fwkerhalf3,fwkerhalf3,(nf3/2+1)*
-				sizeof(FLT),cudaMemcpyHostToDevice));
-#ifdef TIME
-		cudaEventRecord(stop);
-		cudaEventSynchronize(stop);
-		cudaEventElapsedTime(&milliseconds, start, stop);
-		printf("[time  ] \tCopy fwkerhalf HtoD\t %.3g s\n", milliseconds/1000);
-#endif
-		free(fwkerhalf1);
-		if(dim > 1)
-			free(fwkerhalf2);
-		if(dim > 2)
-			free(fwkerhalf3);
-	} else {
-		timer.start();
-		complex<double> a[3*MAX_NQUAD];
-		FLT             f[3*MAX_NQUAD];
-		onedim_fseries_kernel_1sthalf(nf1, f, a, d_plan->spopts);
-		if(dim > 1){
-			onedim_fseries_kernel_1sthalf(nf2, f+MAX_NQUAD, a+MAX_NQUAD, d_plan->spopts);
-		}
-		if(dim > 2){
-			onedim_fseries_kernel_1sthalf(nf3, f+2*MAX_NQUAD, a+2*MAX_NQUAD, d_plan->spopts);
-		}
+	CNTime timer; timer.start();
+	complex<double> a[3*MAX_NQUAD];
+	FLT             f[3*MAX_NQUAD];
+	onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts);
+	if(dim > 1){
+		onedim_fseries_kernel_precomp(nf2, f+MAX_NQUAD, a+MAX_NQUAD, d_plan->spopts);
+	}
+	if(dim > 2){
+		onedim_fseries_kernel_precomp(nf3, f+2*MAX_NQUAD, a+2*MAX_NQUAD, d_plan->spopts);
+	}
 #ifdef TIME
-		printf("[time  ] \tkernel fser (1st half on CPU):\t %.3g s\n", timer.elapsedsec());
+	printf("[time  ] \tkernel fser (1st half on CPU):\t %.3g s\n", timer.elapsedsec());
 #endif
 
-		cudaEventRecord(start);
-		cuDoubleComplex *d_a;
-		FLT   *d_f;
-		checkCudaErrors(cudaMalloc(&d_a, dim*MAX_NQUAD*sizeof(cuDoubleComplex)));
-		checkCudaErrors(cudaMalloc(&d_f, dim*MAX_NQUAD*sizeof(FLT)));
-		checkCudaErrors(cudaMemcpy(d_a,a,dim*MAX_NQUAD*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
-		checkCudaErrors(cudaMemcpy(d_f,f,dim*MAX_NQUAD*sizeof(FLT),cudaMemcpyHostToDevice));
-		ier = CUONEDIMFSERIESKERNEL(d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1,
-			d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->spopts.nspread);
+	cudaEventRecord(start);
+	cuDoubleComplex *d_a;
+	FLT   *d_f;
+	checkCudaErrors(cudaMalloc(&d_a, dim*MAX_NQUAD*sizeof(cuDoubleComplex)));
+	checkCudaErrors(cudaMalloc(&d_f, dim*MAX_NQUAD*sizeof(FLT)));
+	checkCudaErrors(cudaMemcpy(d_a,a,dim*MAX_NQUAD*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
+	checkCudaErrors(cudaMemcpy(d_f,f,dim*MAX_NQUAD*sizeof(FLT),cudaMemcpyHostToDevice));
+	ier = CUFSERIESKERNELCOMPUTE(d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1,
+		d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->spopts.nspread);
 #ifdef TIME
-		cudaEventRecord(stop);
-		cudaEventSynchronize(stop);
-		cudaEventElapsedTime(&milliseconds, start, stop);
-		printf("[time  ] \tkernel fser (2nd half on GPU)\t %.3g s\n", milliseconds/1000);
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&milliseconds, start, stop);
+	printf("[time  ] \tkernel fser (2nd half on GPU)\t %.3g s\n", milliseconds/1000);
 #endif
-		cudaFree(d_a);
-		cudaFree(d_f);
-	}
+	cudaFree(d_a);
+	cudaFree(d_f);
 	// Multi-GPU support: reset the device ID
         cudaSetDevice(orig_gpu_device_id);
 
diff --git a/src/precision_independent.cu b/src/precision_independent.cu
@@ -12,10 +12,6 @@
 /* Auxiliary func to compute power of complex number */
 __device__ RT carg(const CT& z) {return (RT)atan2(ipart(z), rpart(z));} // polar angle
 __device__ RT cabs(const CT& z) {return (RT)cuCabs(z);}
-__device__ CT cpow(const CT& z, const int &n) {
-	RT abs_z_n = pow(cabs(z), n);
-	return cmplx(abs_z_n*cos(n*carg(z)), abs_z_n*sin(n*carg(z)));
-}
 
 /* Common Kernels from spreadinterp3d */
 __host__ __device__
diff --git a/test/fseries_kernel_test.cu b/test/fseries_kernel_test.cu
@@ -106,11 +106,11 @@ int main(int argc, char* argv[])
 		timer.start();
 		complex<double> a[dim*MAX_NQUAD];
 		FLT             f[dim*MAX_NQUAD];
-		onedim_fseries_kernel_1sthalf(nf1, f, a, opts);
+		onedim_fseries_kernel_precomp(nf1, f, a, opts);
 		if(dim > 1)
-			onedim_fseries_kernel_1sthalf(nf2, f+MAX_NQUAD, a+MAX_NQUAD, opts);
+			onedim_fseries_kernel_precomp(nf2, f+MAX_NQUAD, a+MAX_NQUAD, opts);
 		if(dim > 2)
-			onedim_fseries_kernel_1sthalf(nf3, f+2*MAX_NQUAD, a+2*MAX_NQUAD, opts);
+			onedim_fseries_kernel_precomp(nf3, f+2*MAX_NQUAD, a+2*MAX_NQUAD, opts);
 		cputime = timer.elapsedsec();
 
 		cuDoubleComplex *d_a;
@@ -123,7 +123,7 @@ int main(int argc, char* argv[])
 				dim*MAX_NQUAD*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
 			checkCudaErrors(cudaMemcpy(d_f,f,
 				dim*MAX_NQUAD*sizeof(FLT),cudaMemcpyHostToDevice));
-			ier = CUONEDIMFSERIESKERNEL(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1,
+			ier = CUFSERIESKERNELCOMPUTE(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1,
 				d_fwkerhalf2, d_fwkerhalf3, opts.nspread);
 		}
 		cudaEventRecord(stop);
@@ -151,12 +151,14 @@ int main(int argc, char* argv[])
 	for(int i=0; i<nf1/2+1; i++)
 		printf("%10.8e ", fwkerhalf1[i]);
 	printf("\n");
-	for(int i=0; i<nf2/2+1; i++)
-		printf("%10.8e ", fwkerhalf2[i]);
-	printf("\n");
-	for(int i=0; i<nf3/2+1; i++)
-		printf("%10.8e ", fwkerhalf3[i]);
-	printf("\n");
+	if(dim > 1)
+		for(int i=0; i<nf2/2+1; i++)
+			printf("%10.8e ", fwkerhalf2[i]);
+		printf("\n");
+	if(dim > 2)
+		for(int i=0; i<nf3/2+1; i++)
+			printf("%10.8e ", fwkerhalf3[i]);
+		printf("\n");
 #endif
 
 	return 0;
diff --git a/test/fseriesperf.sh b/test/fseriesperf.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
-# basic perf test of spread/interp for 2/3d, single/double
-# Barnett 1/29/21, some 1D added 12/2/21.
+# basic perf test of the fseries kernel computation for 1d, single/double
+# Melody 02/20/22
 
 BIN=../bin/fseries_kernel_test
 DIM=1