/**
 * Create a plan for a 2-D complex-to-complex transform.
 *
 * With HAVE_LIBFFTW3 defined, the plan wraps an FFTW plan created with
 * FFTW_ESTIMATE; when HAVE_LIBPTHREAD is also defined, the FFTW planner calls
 * are serialized on `mutex`. Otherwise a KISS FFT N-d configuration is
 * allocated and the input/output pointers are stored in the returned plan.
 *
 * @param n     extents of the two dimensions
 * @param in    input buffer
 * @param out   output buffer
 * @param sign  forwarded unchanged as the direction argument of the backend
 *              (`sign` of fftw_plan_dft / `inverse_fft` of kiss_fftnd_alloc)
 * @return a heap-allocated plan, or NULL when either the backend setup or the
 *         allocation of the wrapper structure failed
 *
 * NOTE(review): the two backends are handed the same `sign` value; confirm
 * that callers pass a value with the intended meaning for the backend that is
 * compiled in.
 */
fft_plan fft_plan_dft_2d( int n[2], std::complex<double> *in, std::complex<double> *out, int sign )
{
    fft_plan plan = NULL;

#ifdef HAVE_LIBFFTW3
# ifdef HAVE_LIBPTHREAD
    pthread_mutex_lock(&mutex);
# endif
    fftw_plan p = fftw_plan_dft(2, n, (fftw_complex*)in, (fftw_complex*)out, sign, FFTW_ESTIMATE);
    if(NULL != p) {
        plan = (fft_plan)malloc(sizeof(tag_fft_plan));
        if(NULL != plan) {
            plan->plan = p;
        } else {
            /* Wrapper allocation failed: release the FFTW plan while the
             * planner is still serialized, so nothing leaks. */
            fftw_destroy_plan(p);
        }
    }
# ifdef HAVE_LIBPTHREAD
    pthread_mutex_unlock(&mutex);
# endif
#else
    kiss_fftnd_cfg cfg = kiss_fftnd_alloc(n, 2, sign, NULL, NULL);
    if(NULL != cfg) {
        plan = (fft_plan)malloc(sizeof(tag_fft_plan));
        if(NULL != plan) {
            plan->cfg = cfg;
            plan->in = in;
            plan->out = out;
        } else {
            /* Wrapper allocation failed: do not leak the KISS configuration. */
            free(cfg);
        }
    }
#endif

    return plan;
}
/*
 * Stream N-dimensional complex FFTs from `fin` to `fout`.
 *
 * Repeatedly reads one complete block of prod(dims) kiss_fft_cpx samples from
 * `fin`, transforms it in place with kiss_fftnd() and writes the result to
 * `fout`. Processing stops at the first incomplete read.
 *
 *   fin, fout  input and output streams
 *   dims       extents of the transform, `ndims` entries
 *   ndims      number of dimensions
 *   isinverse  forwarded to kiss_fftnd_alloc() as its inverse flag
 *
 * On an invalid dimension, allocation failure or short write, a message is
 * printed to stderr and the function returns after releasing its resources.
 */
static void fft_filend(FILE * fin, FILE * fout, int *dims, int ndims, int isinverse)
{
    kiss_fftnd_cfg st = NULL;
    kiss_fft_cpx *buf = NULL;
    /* Largest element count whose byte size still fits in size_t. */
    const size_t max_elems = ((size_t)-1) / sizeof(kiss_fft_cpx);
    size_t nelem = 1;
    int i;

    for (i = 0; i < ndims; ++i) {
        if (dims[i] <= 0 || nelem > max_elems / (size_t)dims[i]) {
            fprintf(stderr, "fft_filend: invalid or too large dimension %d\n", dims[i]);
            return;
        }
        nelem *= (size_t)dims[i];
    }

    buf = (kiss_fft_cpx *) malloc(sizeof(kiss_fft_cpx) * nelem);
    st = kiss_fftnd_alloc(dims, ndims, isinverse, 0, 0);
    if (buf == NULL || st == NULL) {
        fprintf(stderr, "fft_filend: could not allocate transform state\n");
        free(st);
        free(buf);
        return;
    }

    /* One fread item == one whole N-d block, so partial trailing data is ignored. */
    while (fread(buf, sizeof(kiss_fft_cpx) * nelem, 1, fin) > 0) {
        kiss_fftnd(st, buf, buf);
        if (fwrite(buf, sizeof(kiss_fft_cpx), nelem, fout) != nelem) {
            fprintf(stderr, "fft_filend: short write on output\n");
            break;
        }
    }

    free(st);
    free(buf);
}
int main(int argc, char ** argv) { int k; int nfft[32]; int ndims = 1; int isinverse = 0; int numffts = 1000, i; kiss_fft_cpx * buf; kiss_fft_cpx * bufout; int real = 0; nfft[0] = 1024;// default while (1) { int c = getopt(argc, argv, "n:ix:r"); if (c == -1) break; switch (c) { case 'r': real = 1; break; case 'n': ndims = getdims(nfft, optarg); if (nfft[0] != kiss_fft_next_fast_size(nfft[0])) { int ng = kiss_fft_next_fast_size(nfft[0]); fprintf(stderr, "warning: %d might be a better choice for speed than %d\n", ng, nfft[0]); } break; case 'x': numffts = atoi(optarg); break; case 'i': isinverse = 1; break; } } int nbytes = sizeof(kiss_fft_cpx); for (k = 0; k < ndims; ++k) nbytes *= nfft[k]; #ifdef USE_SIMD numffts /= 4; fprintf(stderr, "since SIMD implementation does 4 ffts at a time, numffts is being reduced to %d\n", numffts); #endif buf = (kiss_fft_cpx*)KISS_FFT_MALLOC(nbytes); bufout = (kiss_fft_cpx*)KISS_FFT_MALLOC(nbytes); memset(buf, 0, nbytes); pstats_init(); if (ndims == 1) { if (real) { kiss_fftr_cfg st = kiss_fftr_alloc(nfft[0] , isinverse , 0, 0); if (isinverse) for (i = 0; i < numffts; ++i) kiss_fftri(st , (kiss_fft_cpx*)buf, (kiss_fft_scalar*)bufout); else for (i = 0; i < numffts; ++i) kiss_fftr(st , (kiss_fft_scalar*)buf, (kiss_fft_cpx*)bufout); free(st); } else { kiss_fft_cfg st = kiss_fft_alloc(nfft[0] , isinverse , 0, 0); for (i = 0; i < numffts; ++i) kiss_fft(st , buf, bufout); free(st); } } else { if (real) { kiss_fftndr_cfg st = kiss_fftndr_alloc(nfft, ndims , isinverse , 0, 0); if (isinverse) for (i = 0; i < numffts; ++i) kiss_fftndri(st , (kiss_fft_cpx*)buf, (kiss_fft_scalar*)bufout); else for (i = 0; i < numffts; ++i) kiss_fftndr(st , (kiss_fft_scalar*)buf, (kiss_fft_cpx*)bufout); free(st); } else { kiss_fftnd_cfg st = kiss_fftnd_alloc(nfft, ndims, isinverse , 0, 0); for (i = 0; i < numffts; ++i) kiss_fftnd(st , buf, bufout); free(st); } } free(buf); free(bufout); fprintf(stderr, "KISS\tnfft="); for (k = 0; k < ndims; ++k) fprintf(stderr, 
"%d,", nfft[k]); fprintf(stderr, "\tnumffts=%d\n" , numffts); pstats_report(); kiss_fft_cleanup(); return 0; }
/* generic complex <N>d-transform.
 *
 * Tcl command implementation shared by cfftf_2d/cfftb_2d, cfftf_3d/cfftb_3d
 * and cfftf_4d/cfftb_4d; direction and dimensionality are derived from the
 * command name in objv[0]. The whole call runs with myFftMutex held.
 *
 * Returns TCL_OK with the transformed nested list as interpreter result, or
 * TCL_ERROR with a message in the interpreter. Every exit goes through a
 * single cleanup path that releases the buffers, the KISS configuration, the
 * reference taken on objv[1] and the mutex.
 */
int tcl_cfft_nd(ClientData nodata, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[])
{
    Tcl_Obj *result, **tdata[FFT_MAX_DIM];
    const char *name;
    kiss_fft_cpx *input = NULL;
    kiss_fft_cpx *output = NULL;
    kiss_fftnd_cfg work = NULL;
    int dir, ndim, alldim, ndat[FFT_MAX_DIM];
    int i;
    int have_ref = 0;           /* set once objv[1] has been Tcl_IncrRefCount'ed */
    int status = TCL_ERROR;

    Tcl_MutexLock(&myFftMutex);

    /* set defaults: */
    dir  = FFT_FORWARD;
    ndim = -1;

    /* Parse arguments:
     *
     * usage: cfftf_nd <data>
     *    or: cfftb_nd <data>
     *
     * cfftf_nd : is the Nd complex forward transform.
     * cfftb_nd : is the Nd complex backward transform.
     * <data>   : list containing data to be transformed. each element can be
     *            either a real or a list with two reals interpreted as complex.
     */
    name = Tcl_GetString(objv[0]);
    if (strcmp(name,"cfftf_2d") == 0) {
        dir = FFT_FORWARD;
        ndim = 2;
    } else if (strcmp(name,"cfftb_2d") == 0) {
        dir = FFT_BACKWARD;
        ndim = 2;
    } else if (strcmp(name,"cfftf_3d") == 0) {
        dir = FFT_FORWARD;
        ndim = 3;
    } else if (strcmp(name,"cfftb_3d") == 0) {
        dir = FFT_BACKWARD;
        ndim = 3;
    } else if (strcmp(name,"cfftf_4d") == 0) {
        dir = FFT_FORWARD;
        ndim = 4;
    } else if (strcmp(name,"cfftb_4d") == 0) {
        dir = FFT_BACKWARD;
        ndim = 4;
    } else {
        Tcl_AppendResult(interp, name, ": unknown fft command.", NULL);
        goto done;
    }

    if (objc != 2) {
        Tcl_WrongNumArgs(interp, 1, objv, "<data>");
        goto done;
    }

    /* mark data as busy and check */
    Tcl_IncrRefCount(objv[1]);
    have_ref = 1;
    if (Tcl_ListObjGetElements(interp, objv[1], &(ndat[0]), &(tdata[0])) != TCL_OK) {
        goto done;
    }

    if ((ndat[0] < 0) || (ndim > FFT_MAX_DIM)) {
        /* this should not happen, but... */
        Tcl_AppendResult(interp, name, ": illegal or unsupported data array.", NULL);
        goto done;
    }

    if (ndat[0] == 0) {
        /* no effect for empty array; publish the result before our
         * reference is dropped in the cleanup path. */
        Tcl_SetObjResult(interp, objv[1]);
        status = TCL_OK;
        goto done;
    }

    check_thread_count(interp,"fftcmds");

    /* determine size of each dimension for storage size and parsing/checking. */
    alldim = ndat[0];
    for (i = 1; i < ndim; ++i) {
        if (Tcl_ListObjGetElements(interp, tdata[i-1][0], &(ndat[i]), &(tdata[i])) != TCL_OK) {
            goto done;
        }
        if (ndat[i] <= 0) {
            /* an empty inner list has no first element to descend into */
            Tcl_AppendResult(interp, name, ": illegal data array.", NULL);
            goto done;
        }
        alldim *= ndat[i];
    }

    input  = (kiss_fft_cpx *)Tcl_Alloc(alldim*sizeof(kiss_fft_cpx));
    output = (kiss_fft_cpx *)Tcl_Alloc(alldim*sizeof(kiss_fft_cpx));
    work   = kiss_fftnd_alloc(ndat, ndim, dir, NULL, NULL);
    if (work == NULL) {
        Tcl_AppendResult(interp, name, ": could not set up transform.", NULL);
        goto done;
    }

    /* parse/copy data list through recursive function. */
    alldim = 0;
    for (i = 0; i < ndat[0]; ++i) {
        if (read_list_list(interp, tdata[0][i], 1, ndim, ndat, input, &alldim) != TCL_OK) {
            Tcl_AppendResult(interp, name, ": illegal data array.", NULL);
            goto done;
        }
    }

    /* finally run the transform */
    kiss_fftnd(work, input, output);

    /* build result list(s) recursively */
    result = Tcl_NewListObj(0, NULL);
    alldim = 0;
    for (i = 0; i < ndat[0]; ++i) {
        make_list_list(interp, result, 1, ndim, ndat, output, &alldim);
    }
    Tcl_SetObjResult(interp, result);
    status = TCL_OK;

done:
    /* free intermediate storage (also on the error paths) */
    if (input != NULL) {
        Tcl_Free((char *)input);
    }
    if (output != NULL) {
        Tcl_Free((char *)output);
    }
    if (work != NULL) {
        kiss_fft_free(work);
        kiss_fft_cleanup();
    }
    if (have_ref) {
        Tcl_DecrRefCount(objv[1]);
    }
    Tcl_MutexUnlock(&myFftMutex);
    return status;
}
/**
 * Forward 2-D FFT of an image.
 *
 * Copies the pixels of `src` (read as Tsrc, scanlines visited from
 * height-1 down to 0) into a complex buffer with zero imaginary part, runs a
 * forward kiss_fftnd() over {height, width} and stores the spectrum in a
 * newly allocated FIT_COMPLEX bitmap, written in the same scanline order.
 *
 * @param src  source bitmap
 * @return the new FIT_COMPLEX bitmap, or NULL if the FFT configuration or the
 *         destination bitmap could not be allocated
 *
 * NOTE(review): buffer allocation failures are delegated to CheckMemory(),
 * whose behaviour is not visible here - confirm it does not leave `fftbuf`
 * unreleased when the second allocation fails.
 */
template<class Tsrc> FIBITMAP* FFT2D<Tsrc>::FFT(FIBITMAP *src)
{
    int height, width;
    int i = 0, x, y;
    int dims[2];
    const int ndims = 2;
    size_t bufsize;
    Tsrc *bits;
    FICOMPLEX *outbits;
    FIBITMAP *dst = NULL;
    kiss_fftnd_cfg st = NULL;
    kiss_fft_cpx *fftbuf;
    kiss_fft_cpx *fftoutbuf;
    kiss_fft_cpx *tmp_fftoutbuf;

    // Dims needs to be {rows, cols}, if you have contiguous rows.
    dims[0] = height = FreeImage_GetHeight(src);
    dims[1] = width = FreeImage_GetWidth(src);

    // Widen before multiplying so the pixel count cannot overflow `int`.
    bufsize = (size_t)width * (size_t)height * sizeof(kiss_fft_cpx);
    fftbuf = (kiss_fft_cpx*) malloc(bufsize);
    tmp_fftoutbuf = fftoutbuf = (kiss_fft_cpx*) malloc(bufsize);

    CheckMemory(fftbuf);
    CheckMemory(fftoutbuf);

    memset(fftbuf, 0, bufsize);
    memset(tmp_fftoutbuf, 0, bufsize);

    st = kiss_fftnd_alloc(dims, ndims, 0, 0, 0);
    if (st == NULL) {
        // Without a configuration kiss_fftnd() would dereference NULL.
        goto Error;
    }

    for (y = height - 1; y >= 0; y--) {
        bits = (Tsrc *) FreeImage_GetScanLine(src, y);
        for (x = 0; x < width; x++) {
            fftbuf[i].r = (float) bits[x];
            fftbuf[i].i = 0.0;
            i++;
        }
    }

    kiss_fftnd(st, fftbuf, tmp_fftoutbuf);

    if ( (dst = FreeImage_AllocateT(FIT_COMPLEX, width, height, 32, 0, 0, 0)) == NULL )
        goto Error;

    // tmp_fftoutbuf walks the output one row at a time; fftoutbuf keeps the
    // original pointer for free().
    for (y = height - 1; y >= 0; y--) {
        outbits = (FICOMPLEX *) FreeImage_GetScanLine(dst, y);
        for (x = 0; x < width; x++) {
            (outbits + x)->r = (double)((tmp_fftoutbuf + x)->r);
            (outbits + x)->i = (double)((tmp_fftoutbuf + x)->i);
        }
        tmp_fftoutbuf += width;
    }

Error:
    free(fftbuf);
    free(fftoutbuf);
    free(st);

    return dst;
}
/* Compare a 3d FFT against a reference FFT */ void test_distributed_fft_3d_compare() { int s,p; MPI_Comm_size(MPI_COMM_WORLD, &p); MPI_Comm_rank(MPI_COMM_WORLD, &s); int nd = 3; int *pdim; pdim = (int *) malloc(sizeof(int)*nd); /* choose a decomposition */ int r = powf(p,1.0/(double)nd)+0.5; int root = 1; while (root < r) root*=2; int ptot = 1; if (!s) printf("Processor grid: "); int i; for (i = 0; i < nd-1; ++i) { pdim[i] = (((ptot*root) > p) ? 1 : root); ptot *= pdim[i]; if (!s) printf("%d x ",pdim[i]); } pdim[nd-1] = p/ptot; if (!s) printf("%d\n", pdim[nd-1]); /* determine processor index */ int *pidx; pidx = (int*)malloc(nd*sizeof(int)); int idx = s; for (i = nd-1; i >= 0; --i) { pidx[i] = idx % pdim[i]; idx /= pdim[i]; } int *dim_glob; dim_glob = (int *) malloc(sizeof(int)*nd); // Do a pdim[0]*4 x pdim[1]* 8 x pdim[2] * 16 FFT (powers of two) int local_nx = 4; int local_ny = 8; int local_nz = 16; dim_glob[0] = pdim[0]*local_nx; dim_glob[1] = pdim[1]*local_ny; dim_glob[2] = pdim[2]*local_nz; for (i = 0; i < nd-1; ++i) if (!s) printf("%d x ",dim_glob[i]); if (!s) printf("%d matrix\n", dim_glob[nd-1]); float scale = dim_glob[0]*dim_glob[1]*dim_glob[2]; /* assume 0.5 sig digit loss per addition/twiddling (empirical)*/ float sig_digits = 7.0-0.5*logf(scale)/logf(2.0); double tol = powf(10.0,-sig_digits); double abs_tol = 1.0*tol; printf("Testing with %f sig digits, rel precision %f, abs precision %f\n", sig_digits, tol, abs_tol); kiss_fft_cpx *in_kiss; in_kiss = (kiss_fft_cpx *)malloc(sizeof(kiss_fft_cpx)*dim_glob[0]*dim_glob[1]*dim_glob[2]); srand(12345); // fill table with complex random numbers in row major order int x,y,z; int nx = dim_glob[0]; int ny = dim_glob[1]; int nz = dim_glob[2]; for (x = 0; x < dim_glob[0]; ++x) for (y = 0; y < dim_glob[1]; ++y) for (z = 0; z < dim_glob[2]; ++z) { // KISS has column-major storage in_kiss[z+nz*(y+ny*x)].r = (float)rand()/(float)RAND_MAX; in_kiss[z+nz*(y+ny*x)].i =(float)rand()/(float)RAND_MAX; } kiss_fft_cpx *out_kiss; 
out_kiss = (kiss_fft_cpx *)malloc(sizeof(kiss_fft_cpx)*dim_glob[0]*dim_glob[1]*dim_glob[2]); // construct forward transform kiss_fftnd_cfg cfg = kiss_fftnd_alloc(dim_glob,3,0,NULL,NULL); // carry out conventional FFT kiss_fftnd(cfg, in_kiss, out_kiss); // compare to distributed FFT cuda_cpx_t * in_d, *in_h; cudaMalloc((void **)&in_d,sizeof(cuda_cpx_t)*local_nx*local_ny*local_nz); in_h = (cuda_cpx_t *) malloc(sizeof(cuda_cpx_t)*local_nx*local_ny*local_nz); int x_local, y_local, z_local; for (x = 0; x < nx; ++x) for (y = 0; y < ny; ++y) for (z = 0; z < nz; ++z) { if (x>=pidx[0]*local_nx && x < (pidx[0]+1)*local_nx && y>=pidx[1]*local_ny && y < (pidx[1]+1)*local_ny && z>=pidx[2]*local_nz && z < (pidx[2]+1)*local_nz) { x_local = x - pidx[0]*local_nx; y_local = y - pidx[1]*local_ny; z_local = z - pidx[2]*local_nz; CUDA_RE(in_h[z_local+local_nz*(y_local+local_ny*x_local)]) = in_kiss[z+nz*(y+ny*x)].r; CUDA_IM(in_h[z_local+local_nz*(y_local+local_ny*x_local)]) = in_kiss[z+nz*(y+ny*x)].i; } } cuda_cpx_t *out_d, *out_h; cudaMalloc((void **)&out_d,sizeof(cuda_cpx_t)*local_nx*local_ny*local_nz); out_h = (cuda_cpx_t *) malloc(sizeof(cuda_cpx_t)*local_nx*local_ny*local_nz); dfft_plan plan; dfft_cuda_create_plan(&plan,3, dim_glob, NULL, NULL, pdim, pidx,0, 0, 0, MPI_COMM_WORLD, proc_map); dfft_cuda_check_errors(&plan,1); /* copy data to device */ cudaMemcpy(in_d, in_h, sizeof(cuda_cpx_t)*local_nx*local_ny*local_nz, cudaMemcpyDefault); // forward transform dfft_cuda_execute(in_d, out_d, 0, &plan); /* copy data back to host */ cudaMemcpy(out_h, out_d, sizeof(cuda_cpx_t)*local_nx*local_ny*local_nz, cudaMemcpyDefault); // do comparison int n_wave_local = local_nx * local_ny * local_nz; for (i = 0; i < n_wave_local; ++i) { x_local = i / local_ny / local_nz; y_local = (i - x_local*local_ny*local_nz)/local_nz; z_local = i % local_nz; x = pidx[0]*local_nx + x_local; y = pidx[1]*local_ny + y_local; z = pidx[2]*local_nz + z_local; double re = CUDA_RE(out_h[i]); double im = 
CUDA_IM(out_h[i]); double re_kiss = out_kiss[z+nz*(y+ny*x)].r; double im_kiss = out_kiss[z+nz*(y+ny*x)].i; if (fabs(re_kiss) < abs_tol) { CHECK_SMALL(re,2*abs_tol); } else { CHECK_CLOSE(re,re_kiss, tol); } if (fabs(im_kiss) < abs_tol) { CHECK_SMALL(im,2*abs_tol); } else { CHECK_CLOSE(im, im_kiss, tol); } } free(in_kiss); free(out_kiss); cudaFree(out_d); cudaFree(in_d); free(in_h); free(out_h); free(pidx); free(dim_glob); dfft_cuda_destroy_plan(plan); }