/*
 * Allocates the device work arrays referenced by `arr` and creates the cuFFT
 * plans stored in `plans`.
 *
 *   plans - receives `plan` (main transform), `pl_int` and `pl_inv`
 *           (double-precision plans used for spline interpolation)
 *   arr   - receives the device pointers and `arr_len`
 *   sett  - supplies fftpad, nfft and Ninterp; `nfftf` is written here
 *   opts  - `fftinterp` selects interbinning (INT) or zero padding
 *
 * The return codes of the cufftPlan1d calls are not inspected.
 */
void plan_fft(FFT_plans *plans, Arrays *arr,
              Detector_settings *sett, Command_line_opts *opts)
{
    /* ############ FFT Plans ################ */

    // Work arrays are sized to the larger of Ninterp and fftpad*nfft.
    arr->arr_len = (sett->fftpad * sett->nfft > sett->Ninterp
                    ? sett->fftpad * sett->nfft
                    : sett->Ninterp);

    const size_t dbl_bytes = arr->arr_len*sizeof(cufftDoubleComplex);
    const size_t cpx_bytes = arr->arr_len*sizeof(COMPLEX_TYPE);

    // Double-precision complex buffers.
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xa,  dbl_bytes) );
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xb,  dbl_bytes) );
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xar, dbl_bytes) );
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xbr, dbl_bytes) );

    // Buffers in the configured COMPLEX_TYPE.
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xa_f,  cpx_bytes) );
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xb_f,  cpx_bytes) );
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xar_f, cpx_bytes) );
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xbr_f, cpx_bytes) );

    // Length of the zero-padded transform.
    sett->nfftf = sett->fftpad*sett->nfft;

    if (opts->fftinterp == INT) {
        // Interbinning: two extra nfft-sized buffers and an unpadded plan.
        CudaSafeCall ( cudaMalloc((void**)&arr->cu_xa2_f, sett->nfft*sizeof(COMPLEX_TYPE)) );
        CudaSafeCall ( cudaMalloc((void**)&arr->cu_xb2_f, sett->nfft*sizeof(COMPLEX_TYPE)) );
        cufftPlan1d( &(plans->plan), sett->nfft, CUFFT_TRANSFORM_TYPE, 1);
    } else {
        // FFT with zero padding: plan over the padded length.
        cufftPlan1d( &(plans->plan), sett->nfftf, CUFFT_TRANSFORM_TYPE, 1);
    }

    // Plans for interpolation with splines.
    cufftPlan1d(&(plans->pl_int), sett->nfft, CUFFT_Z2Z, 1);
    cufftPlan1d(&(plans->pl_inv), sett->Ninterp, CUFFT_Z2Z, 1);

    /* ############ FFT plans end ################ */
}
/*!
 * Computes the N-point forward DFT of the host signal X with cuFFT and
 * stores the result, multiplied by `scale`, in the host array Y.
 *
 * \param Y      host output array holding at least N elements
 * \param X      host input array holding at least N elements
 * \param scale  factor applied to the real and imaginary part of every output
 * \param N      transform length
 * \return true on success (including the empty transform N == 0); false if an
 *         argument is invalid or any CUDA / cuFFT call fails, in which case
 *         Y is left unmodified.
 */
bool cuda_dft(cuComplex *Y, cuComplex *X, float scale, int N)
{
  if (!Y || !X || N < 0)
    return false;
  if (N == 0)
    return true;  // nothing to transform

  const size_t bytes = (size_t)N * sizeof(cuComplex);

  cuComplex *Y_gpu = 0;
  cuComplex *X_gpu = 0;
  cufftHandle plan;
  bool have_plan = false;
  bool ok = false;

  // Single-pass block so that every failure falls through to the shared
  // cleanup below.
  do
  {
    if (cudaMalloc((void **)&Y_gpu, bytes) != cudaSuccess) break;
    if (cudaMalloc((void **)&X_gpu, bytes) != cudaSuccess) break;

    // Only the input is uploaded; the out-of-place transform overwrites
    // every element of Y_gpu.
    if (cudaMemcpy(X_gpu, X, bytes, cudaMemcpyHostToDevice) != cudaSuccess) break;

    if (cufftPlan1d(&plan, N, CUFFT_C2C, 1) != CUFFT_SUCCESS) break;
    have_plan = true;

    if (cufftExecC2C(plan, X_gpu, Y_gpu, CUFFT_FORWARD) != CUFFT_SUCCESS) break;

    // Blocking copy: also waits for the transform to finish.
    if (cudaMemcpy(Y, Y_gpu, bytes, cudaMemcpyDeviceToHost) != cudaSuccess) break;

    ok = true;
  } while (0);

  if (have_plan)
    cufftDestroy(plan);
  cudaFree(Y_gpu);  // cudaFree(0) is a no-op
  cudaFree(X_gpu);

  if (!ok)
    return false;

  for (int n = 0; n < N; n++)
  {
    Y[n].x *= scale;
    Y[n].y *= scale;
  }

  return true;
}
/*
 * In-place forward batched 1D FFT (double precision) on device data.
 *
 *   d_data - device buffer, reinterpreted as cufftDoubleComplex
 *   Lx     - length of each transform
 *   Ny     - number of transforms in the batch
 *   stream - CUDA stream (passed as void*, cast to cudaStream_t) the
 *            transform is issued on
 *
 * A temporary plan is created and destroyed on every call. The function has
 * no way to report errors: if the plan cannot be created or bound to the
 * stream, it returns without touching d_data.
 */
extern "C" void cuda_fft(double *d_data, int Lx, int Ny, void *stream)
{
  cufftHandle plan;

  // Without a valid handle neither the execution nor cufftDestroy is safe.
  if (cufftPlan1d(&plan, Lx, CUFFT_Z2Z, Ny) != CUFFT_SUCCESS)
    return;

  if (cufftSetStream(plan, (cudaStream_t)stream) == CUFFT_SUCCESS)
  {
    cufftDoubleComplex *z = (cufftDoubleComplex*)d_data;
    cufftExecZ2Z(plan, z, z, CUFFT_FORWARD);
  }

  cufftDestroy(plan);
}
/*
 * In-place forward batched 1D FFT (single precision) on device data.
 *
 *   d_data - device buffer, reinterpreted as cufftComplex
 *   Lx     - length of each transform
 *   Ny     - number of transforms in the batch
 *   stream - CUDA stream (passed as void*, cast to cudaStream_t) the
 *            transform is issued on
 *
 * A temporary plan is created and destroyed on every call. The function has
 * no way to report errors: if the plan cannot be created or bound to the
 * stream, it returns without touching d_data.
 */
extern "C" void cuda_fft(float *d_data, int Lx, int Ny, void *stream)
{
  cufftHandle plan;

  // Without a valid handle neither the execution nor cufftDestroy is safe.
  if (cufftPlan1d(&plan, Lx, CUFFT_C2C, Ny) != CUFFT_SUCCESS)
    return;

  if (cufftSetStream(plan, (cudaStream_t)stream) == CUFFT_SUCCESS)
  {
    cufftComplex *c = (cufftComplex*)d_data;
    cufftExecC2C(plan, c, c, CUFFT_FORWARD);
  }

  cufftDestroy(plan);
}
/*
 * Runs a forward complex-to-complex cuFFT of fixed size (NX = 256, one batch)
 * on freshly allocated device buffers.
 *
 * NOTE(review): dst_dp and src1_dp are not referenced and the device input
 * buffer is never filled, so this looks like an unfinished stub - confirm
 * before relying on it.
 *
 * All device buffers and the plan are released on every exit path.
 */
void g_fwdfft(QSP_ARG_DECL  Data_Obj *dst_dp, Data_Obj *src1_dp)
{
	//Variable declarations
	const int NX = 256;
	const int BATCH = 1;

	cufftResult_t status;
	cudaError_t drv_err;

	//Declare plan for FFT
	cufftHandle plan;

	void *data = NULL;
	void *result = NULL;

	//Allocate device memory
	drv_err = cudaMalloc(&data, sizeof(cufftComplex)*NX*BATCH);
	if( drv_err != cudaSuccess ){
		WARN("error allocating cuda data buffer for fft!?");
		return;
	}
	drv_err = cudaMalloc(&result, sizeof(cufftComplex)*NX*BATCH);
	if( drv_err != cudaSuccess ){
		WARN("error allocating cuda result buffer for fft!?");
		cudaFree(data);		// release the first buffer
		return;
	}

	//Create plan for FFT
	status = cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
	if (status != CUFFT_SUCCESS) {
		sprintf(ERROR_STRING, "Error in cufftPlan1d: %s\n",
				getCUFFTError(status));
		NWARN(ERROR_STRING);
		// No valid plan: nothing to execute or destroy.
		cudaFree(result);
		cudaFree(data);
		return;
	}

	//Run forward fft on data
	status = cufftExecC2C(plan, (cufftComplex *)data,
			(cufftComplex *)result, CUFFT_FORWARD);
	if (status != CUFFT_SUCCESS) {
		sprintf(ERROR_STRING, "Error in cufftExecC2C: %s\n",
				getCUFFTError(status));
		NWARN(ERROR_STRING);
	}

	//Free resources
	cufftDestroy(plan);
	cudaFree(result);
	cudaFree(data);
}
/*
 * Function to be called in the thread managing host operations and invoking
 * kernels.
 *
 * passing_ptr points to a DataArray. The thread allocates and fills the host
 * arrays, then walks through four pthread barrier phases shared with the
 * other thread(s): host memory ready, data copied to the device, FFT
 * finished, final phase. Every path waits on all four barriers so that a
 * cuFFT failure cannot leave the other participants blocked.
 *
 * NOTE(review): the plan length is the file-level N, whereas the arrays are
 * sized by data_arr_ptr->size - confirm the two always agree.
 *
 * The thread terminates through pthread_exit(NULL).
 */
void* host_thread(void* passing_ptr)
{
    DataArray* data_arr_ptr = (DataArray*) passing_ptr;

    alloc_data_host(data_arr_ptr);
    printf("data allocated by host thread\n");

    for (uint64_t ii = 0; ii < data_arr_ptr->size; ii++)
    {
        (*(data_arr_ptr->data_r))[ii] = ii;
        (*(data_arr_ptr->data_k))[ii] = data_arr_ptr->size-ii;
    }
    printf("data filled by host thread\n");

    // synchronize after allocating memory - streams should be created,
    // mem on device ready for copying
    pthread_barrier_wait (&barrier);
    printf("1st barier host thread - allocating mem on cpu\n");

    // the cufft plan can be built while the copy to the device is in flight
    cufftHandle plan_forward;
    const int plan_ok =
        (cufftPlan1d(&plan_forward, N, CUFFT_Z2Z, 1) == CUFFT_SUCCESS);
    if (!plan_ok)
        printf("cufftPlan1d failed in host thread\n");

    // synchronize after copying - data should be on the device
    pthread_barrier_wait (&barrier);
    printf("2nd barier host thread - \n");

    // run some computations (skipped when no valid plan exists)
    if (plan_ok)
    {
        cufftExecZ2Z(plan_forward,
                     *(data_arr_ptr->data_r_dev),
                     *(data_arr_ptr->data_k_dev),
                     CUFFT_FORWARD);
        printf("cufft done\n");
    }

    // the transform is asynchronous: wait for the device before telling the
    // other threads that the computation has finished
    cudaDeviceSynchronize();
    pthread_barrier_wait (&barrier);
    printf("3rd barier host thread - \n");

    pthread_barrier_wait (&barrier);
    printf("4th barier host thread - \n");

    printf("data visible in host thread:\n");
    /*for (uint64_t ii = 0; ii < (data_arr_ptr->size <= 32) ? data_arr_ptr->size : 32 ; ii++)
    {
        printf("%lu.\t",ii);
        printf("%lf + %lfj\t", creal( (*(data_arr_ptr->data_r))[ii] ), cimag( (*(data_arr_ptr->data_r))[ii] ));
        printf("%lf + %lfj\n", creal( (*(data_arr_ptr->data_k))[ii] ), cimag( (*(data_arr_ptr->data_k))[ii] ));
    }*/

    // release the plan created above
    if (plan_ok)
        cufftDestroy(plan_forward);

    printf("closing host thread\n");
    pthread_exit(NULL);
}
/*
 * Creates a batched 1D cuFFT plan in *plan.
 *
 *   plan  - receives the plan handle
 *   type  - 0: CUFFT_C2C, 1: CUFFT_R2C, 2: CUFFT_C2R; any other value falls
 *           back to CUFFT_C2C
 *   n     - transform length (narrowed to int for cuFFT)
 *   batch - number of transforms (narrowed to int for cuFFT)
 *
 * cufftPlan1d reports a cufftResult, which is a different enumeration from
 * cudaError_t. The outcome is therefore translated before being handed to
 * check_error: success maps to cudaSuccess, any failure to cudaErrorUnknown.
 */
void cuda_make_cufft_plan(cufftHandle *plan, int type, size_t n, size_t batch)
{
    cufftType cufft_type;

    switch (type) {
    case 1:  cufft_type = CUFFT_R2C; break;
    case 2:  cufft_type = CUFFT_C2R; break;
    case 0:
    default: cufft_type = CUFFT_C2C; break;
    }

    cufftResult result = cufftPlan1d(plan, (int)n, cufft_type, (int)batch);

    // Do not reinterpret cuFFT codes as CUDA runtime codes.
    cudaError_t status = (result == CUFFT_SUCCESS) ? cudaSuccess
                                                   : cudaErrorUnknown;
    check_error(status);
}
/// Fast convolution of `rows` signals of length `columns` with `kernel`,
/// using CUFFT directly on the buffers passed in.
///
///   in, kernel, out   buffers reinterpreted as cufftComplex
///   rows, columns     number of signals and length of each
///   transform_kernel  when true the kernel is forward-transformed in place
///                     first (one transform of length `columns`)
///
/// The input is transformed in place, multiplied by the kernel with the
/// 1/columns scaling folded in, and `out` is inverse-transformed in place.
void
Fastconv_base<D, T, C>::fconv
  (T const* in, T const* kernel, T* out,
   length_type rows, length_type columns, bool transform_kernel)
{
  // CUFFT only accepts its own complex type.
  typedef cufftComplex ctype;
  ctype* const out_c    = reinterpret_cast<ctype*>(out);
  ctype* const kernel_c =
    const_cast<ctype*>(reinterpret_cast<ctype const*>(kernel));
  ctype* const in_c     =
    const_cast<ctype*>(reinterpret_cast<ctype const*>(in));

  if (transform_kernel)
  {
    // Single 1D transform of the kernel.
    cufftHandle kernel_plan;
    cufftPlan1d(&kernel_plan, columns, CUFFT_C2C, 1);
    cufftExecC2C(kernel_plan, kernel_c, kernel_c, CUFFT_FORWARD);
    cufftDestroy(kernel_plan);
  }

  // One batched plan serves both the forward and the inverse pass.
  cufftHandle batch_plan;
  cufftPlan1d(&batch_plan, columns, CUFFT_C2C, rows);

  cufftExecC2C(batch_plan, in_c, in_c, CUFFT_FORWARD);

  // Multiply by the kernel; the factor compensates the unnormalised inverse.
  typedef typename impl::scalar_of<T>::type scalar_type;
  scalar_type const scale = 1 / static_cast<scalar_type>(columns);

  if (D != 1)
    mmmuls(kernel, in, out, scale, rows, columns);
  else
    vmmuls_row(kernel, in, out, scale, rows, columns);

  cufftExecC2C(batch_plan, out_c, out_c, CUFFT_INVERSE);
  cufftDestroy(batch_plan);
}
/*
 * Class:     jcuda_jcufft_JCufft
 * Method:    cufftPlan1dNative
 * Signature: (Ljcuda/jcufft/JCufftHandle;III)I
 *
 * Creates a 1D cuFFT plan and stores the native handle in the `plan` field
 * of the given Java handle object. Returns the cufftResult of cufftPlan1d,
 * or JCUFFT_INTERNAL_ERROR (after raising a NullPointerException) when
 * `handle` is null.
 */
JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftPlan1dNative
  (JNIEnv *env, jclass cla, jobject handle, jint nx, jint type, jint batch)
{
    if (handle != NULL)
    {
        Logger::log(LOG_TRACE, "Creating 1D plan for %d elements of type %d\n", nx, type);

        // Read the current field value, let cuFFT overwrite it, write it back.
        cufftHandle nativePlan = env->GetIntField(handle, cufftHandle_plan);
        cufftResult status = cufftPlan1d(&nativePlan, nx, getCufftType(type), batch);
        env->SetIntField(handle, cufftHandle_plan, nativePlan);
        return status;
    }

    ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftPlan1d");
    return JCUFFT_INTERNAL_ERROR;
}
void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] ) { if (nrhs!=4) mexErrMsgTxt("Wrong number of arguments"); cufftHandle plan = (cufftHandle) mxGetScalar(prhs[0]); int nx = (int) mxGetScalar(prhs[1]); cufftType_t type = (cufftType_t) ((int) mxGetScalar(prhs[2])); int batch = (int) mxGetScalar(prhs[3]); cufftResult status = cufftPlan1d(&plan, nx, type, batch); plhs[0] = mxCreateDoubleScalar(status); if (nlhs>1) plhs[1] = mxCreateDoubleScalar(plan); }
oskar_FFT* oskar_fft_create(int precision, int location, int num_dim, int dim_size, int batch_size_1d, int* status) { int i; oskar_FFT* h = (oskar_FFT*) calloc(1, sizeof(oskar_FFT)); #ifndef OSKAR_HAVE_CUDA if (location == OSKAR_GPU) location = OSKAR_CPU; #endif #ifndef OSKAR_HAVE_OPENCL if (location & OSKAR_CL) location = OSKAR_CPU; #endif h->precision = precision; h->location = location; h->num_dim = num_dim; h->dim_size = dim_size; h->ensure_consistent_norm = 1; h->num_cells_total = (size_t) dim_size; for (i = 1; i < num_dim; ++i) h->num_cells_total *= (size_t) dim_size; if (location == OSKAR_CPU) { int len = 4 * dim_size + 2 * (int)(log((double)dim_size) / log(2.0)) + 8; h->fftpack_wsave = oskar_mem_create(precision, location, len, status); if (num_dim == 1) { (void) batch_size_1d; *status = OSKAR_ERR_FUNCTION_NOT_AVAILABLE; } else if (num_dim == 2) { if (precision == OSKAR_DOUBLE) oskar_fftpack_cfft2i(dim_size, dim_size, oskar_mem_double(h->fftpack_wsave, status)); else oskar_fftpack_cfft2i_f(dim_size, dim_size, oskar_mem_float(h->fftpack_wsave, status)); } else *status = OSKAR_ERR_INVALID_ARGUMENT; h->fftpack_work = oskar_mem_create(precision, location, 2 * h->num_cells_total, status); } else if (location == OSKAR_GPU) { #ifdef OSKAR_HAVE_CUDA if (num_dim == 1) cufftPlan1d(&h->cufft_plan, dim_size, ((precision == OSKAR_DOUBLE) ? CUFFT_Z2Z : CUFFT_C2C), batch_size_1d); else if (num_dim == 2) cufftPlan2d(&h->cufft_plan, dim_size, dim_size, ((precision == OSKAR_DOUBLE) ? CUFFT_Z2Z : CUFFT_C2C)); else *status = OSKAR_ERR_INVALID_ARGUMENT; #endif } else if (location & OSKAR_CL) { #ifdef OSKAR_HAVE_OPENCL *status = OSKAR_ERR_FUNCTION_NOT_AVAILABLE; #endif } else *status = OSKAR_ERR_BAD_LOCATION; return h; }
/** Constructor.

    Creates a 1D CUFFT plan of type CUFFT_C2R in the member `plan` by calling
    cufftPlan1d with the given size and batch count. The result of
    cufftPlan1d is handed to the CUFFT_CHECK macro (presumably the project's
    cuFFT error check - defined outside this view).

    @param size0 requested size of CUFFT plan
    @param batch number of 1D transforms (defaults to 1)
*/
inline Plan(size_t size0, int batch = 1)
{
    CUFFT_CHECK(cufftPlan1d(&plan, size0, CUFFT_C2R, batch));
}
/** Constructor.

    Creates a 1D CUFFT plan of type CUFFT_C2R in the member `plan` by calling
    cufftPlan1d with size[0] as the transform size and the given batch count.
    The result of cufftPlan1d is handed to the CUFFT_CHECK macro (presumably
    the project's cuFFT error check - defined outside this view).

    @param size  requested size of CUFFT plan; only element 0 is read
    @param batch number of 1D transforms (defaults to 1)
*/
inline Plan(const Size<1> &size, int batch = 1)
{
    CUFFT_CHECK(cufftPlan1d(&plan, size[0], CUFFT_C2R, batch));
}
/// Fast convolution of host data on the GPU.
///
///   in, kernel, out   host buffers; `in` and `out` hold rows*columns
///                     elements, `kernel` holds `columns` elements when
///                     D == 1 and rows*columns otherwise
///   transform_kernel  when true the kernel is forward-transformed on the
///                     device first (one transform of length `columns`)
///
/// Input and kernel are copied to device storage, the rows are transformed,
/// multiplied by the kernel (with the 1/columns scaling folded in),
/// inverse-transformed, and the result is copied back into `out`.
void
Fastconv_base<D, T, ComplexFmt>::fconv
  (T const* in, T const* kernel, T* out,
   length_type rows, length_type columns, bool transform_kernel)
{
  // A vector kernel has one row; a matrix kernel matches the input.
  size_t kernel_size = (D == 1) ? columns : rows * columns;

  // Device-side buffers for output, kernel and input.
  Device_storage<T> dev_out(rows * columns);
  Device_storage<T> dev_kernel(kernel_size);
  Device_storage<T> dev_in(rows * columns);

  // The kernel is assumed dense and row-major, so one contiguous copy
  // is sufficient.
  cudaMemcpy(dev_kernel.data(), kernel,
             kernel_size * sizeof(T), cudaMemcpyHostToDevice);
  ASSERT_CUDA_OK();

  // Likewise for the (dense, row-major) input.
  cudaMemcpy(dev_in.data(), in,
             rows * columns * sizeof(T), cudaMemcpyHostToDevice);
  ASSERT_CUDA_OK();

  // CUFFT only accepts its own complex type.
  typedef cufftComplex ctype;
  ctype* const out_c    = reinterpret_cast<ctype*>(dev_out.data());
  ctype* const kernel_c = reinterpret_cast<ctype*>(dev_kernel.data());
  ctype* const in_c     = reinterpret_cast<ctype*>(dev_in.data());

  if (transform_kernel)
  {
    // Single 1D transform of the kernel.
    cufftHandle kernel_plan;
    cufftPlan1d(&kernel_plan, columns, CUFFT_C2C, 1);
    cufftExecC2C(kernel_plan, kernel_c, kernel_c, CUFFT_FORWARD);
    cufftDestroy(kernel_plan);
  }

  // One batched plan serves both the forward and the inverse pass.
  cufftHandle batch_plan;
  cufftPlan1d(&batch_plan, columns, CUFFT_C2C, rows);

  cufftExecC2C(batch_plan, in_c, in_c, CUFFT_FORWARD);

  // Multiply by the kernel; the factor compensates the unnormalised inverse.
  typedef typename impl::Scalar_of<T>::type scalar_type;
  scalar_type const scale = 1 / static_cast<scalar_type>(columns);

  if (D != 1)
    mmmuls_cc(kernel_c, in_c, out_c, scale, rows, columns);
  else
    vmmuls_row_cc(kernel_c, in_c, out_c, scale, rows, columns);

  cufftExecC2C(batch_plan, out_c, out_c, CUFFT_INVERSE);
  cufftDestroy(batch_plan);

  // Bring the result back to the host.
  cudaMemcpy(out, dev_out.data(),
             rows * columns * sizeof(T), cudaMemcpyDeviceToHost);
  ASSERT_CUDA_OK();
}
/*
 * Reads raw integer samples from the file "21-4096", runs a batched forward
 * complex FFT (BATCH transforms of NX points) on the GPU and sends the first
 * 50 magnitude bins of every transform, truncated to int, over `sockfd`.
 *
 * The magnitudes are packed contiguously, 50 per batch, which is exactly the
 * layout and size handed to write().
 *
 * Returns 0 on success and -1 if the input file cannot be opened or a
 * CUDA / cuFFT call fails.
 */
int main2(int sockfd)
{
	cufftHandle plan;
	cufftComplex *devPtr = NULL;
	cufftComplex data[NX*BATCH];
	int i, j, k;
	FILE *f;

	/* BUFSIZE is a size in bytes; the array needs BUFSIZE/sizeof(int) slots */
#define BUFSIZE (21*4096*sizeof(int))
	int buffer[BUFSIZE / sizeof(int)];
	const size_t buffer_len = BUFSIZE / sizeof(int);
	int nread;

	/* number of magnitude bins sent per transform */
	const int bins_sent = 50;

	/* a short read leaves zeros rather than stack garbage */
	memset(buffer, 0, sizeof(buffer));

	f = fopen("21-4096","rb");
	if (f == NULL) {
		printf("cannot open input file 21-4096\n");
		return -1;
	}
	nread=fread(buffer,BUFSIZE,1,f);
	printf("nread=%i\n",nread);
	fclose(f);

	/* source data creation: real part from the file, imaginary part zero */
	for (j=0;j<BATCH;j++) {
		for (k=0;k<NX;k++) {
			i = j*NX+k;
			data[i].x = ((size_t)i < buffer_len) ? buffer[i] : 0;
			data[i].y = 0;
		}
	}

	/* creates 1D FFT plan */
	if (cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH) != CUFFT_SUCCESS) {
		printf("cufftPlan1d failed\n");
		return -1;
	}

	/* GPU memory allocation */
	if (cudaMalloc((void**)&devPtr, sizeof(cufftComplex)*NX*BATCH) != cudaSuccess) {
		printf("cudaMalloc failed\n");
		cufftDestroy(plan);
		return -1;
	}

	/* transfer to GPU, forward FFT in place, transfer results back */
	int gpu_ok =
		cudaMemcpy(devPtr, data, sizeof(cufftComplex)*NX*BATCH,
			   cudaMemcpyHostToDevice) == cudaSuccess
		&& cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD) == CUFFT_SUCCESS
		&& cudaMemcpy(data, devPtr, sizeof(cufftComplex)*NX*BATCH,
			      cudaMemcpyDeviceToHost) == cudaSuccess;

	/* deletes CUFFT plan */
	cufftDestroy(plan);

	/* frees GPU memory */
	cudaFree(devPtr);

	if (!gpu_ok) {
		printf("GPU FFT failed\n");
		return -1;
	}

	int magsint[NX*BATCH];
	memset(magsint,0,sizeof(int)*NX*BATCH);
	int u = 0;

	printf("%f %f %f %f\n",data[0].x,data[1].x,data[2].x,data[3].x);

	for (j=0;j<BATCH;j++) {
		/* exactly bins_sent values per batch, to match the size sent below */
		for (k=0;k<NX && k<bins_sent;k++) {
			i = j*NX+k;
			float mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX;
			magsint[u] = mag;
			u++;
		}
	}

	int n;
	n = write(sockfd,magsint,sizeof(int)*BATCH*50);
	printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]);
	printf("send ok, size: %i\n",n);

	return 0;
}
/*
 * Test driver: runs the same 1D (one size argument) or 2D (two size
 * arguments) complex transform through STARPUFFT and, when compiled in,
 * through FFTW and cuFFT, printing timings and difference statistics.
 *
 * Exits with EXIT_FAILURE on a wrong argument count, or when the relative
 * maximum / average difference exceeds 1e-8 (TYPE "f") or 1e-16 (TYPE "").
 *
 * NOTE(review): in the STARPU_USE_CUDA path, cufftExecC2C is given `in`
 * (from STARPUFFT(malloc)) and `out_cuda` (from plain malloc) - confirm that
 * these are valid device-accessible pointers.
 * NOTE(review): the STARPU_USE_CUDA comparison reads out_fftw, which is only
 * declared under STARPU_HAVE_FFTW - presumably both must be enabled together.
 */
int main(int argc, char *argv[])
{
	int i;
	struct timeval begin, end;
	int size;
	size_t bytes;
	int n = 0, m = 0;
	STARPUFFT(plan) plan;
#ifdef STARPU_HAVE_FFTW
	_FFTW(plan) fftw_plan;
#endif
#ifdef STARPU_USE_CUDA
	cufftHandle cuda_plan;
	cudaError_t cures;
#endif
	double timing;

	if (argc < 2 || argc > 3) {
		fprintf(stderr,"need one or two size of vector\n");
		exit(EXIT_FAILURE);
	}

	starpu_init(NULL);

	if (argc == 2) {
		n = atoi(argv[1]);

		/* 1D */
		size = n;
	} else if (argc == 3) {
		n = atoi(argv[1]);
		m = atoi(argv[2]);

		/* 2D */
		size = n * m;
	} else {
		assert(0);
	}

	bytes = size * sizeof(STARPUFFT(complex));

	/* Input filled with pseudo-random complex values (fixed seed 0). */
	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
	starpu_srand48(0);
	for (i = 0; i < size; i++)
		in[i] = starpu_drand48() + I * starpu_drand48();

	STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out));

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw));
#endif

#ifdef STARPU_USE_CUDA
	STARPUFFT(complex) *out_cuda = malloc(size * sizeof(*out_cuda));
#endif

	/* Create one plan per enabled backend. */
	if (argc == 2) {
		plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		/* A failure is only printed; execution continues. */
		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
			printf("erf\n");
#endif

	} else if (argc == 3) {
		plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
#endif
	} else {
		assert(0);
	}

#ifdef STARPU_HAVE_FFTW
	/* Time the FFTW execution; `timing` is in microseconds. */
	gettimeofday(&begin, NULL);
	_FFTW(execute)(fftw_plan);
	gettimeofday(&end, NULL);
	_FFTW(destroy_plan)(fftw_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif
#ifdef STARPU_USE_CUDA
	/* Time the cuFFT execution, including the synchronisation. */
	gettimeofday(&begin, NULL);
	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
		printf("erf2\n");
	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
		STARPU_CUDA_REPORT_ERROR(cures);
	gettimeofday(&end, NULL);
	cufftDestroy(cuda_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif

	/* The transform under test. */
	STARPUFFT(execute)(plan, in, out);

	STARPUFFT(showstats)(stdout);
	STARPUFFT(destroy_plan)(plan);

	printf("\n");
#if 0
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(in[i]), creal(in[i]));
	printf("\n\n");
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out[i]), creal(out[i]));
	printf("\n\n");
#ifdef STARPU_HAVE_FFTW
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i]));
	printf("\n\n");
#endif
#endif

#ifdef STARPU_HAVE_FFTW
	/* Compare the STARPUFFT output against the FFTW output. */
{
	double max = 0., tot = 0., norm = 0., normdiff = 0.;
	for (i = 0; i < size; i++) {
		double diff = cabs(out[i]-out_fftw[i]);
		double diff2 = diff * diff;
		/* NB: this `size` shadows the outer element count inside the loop body. */
		double size = cabs(out_fftw[i]);
		double size2 = size * size;
		if (diff > max)
			max = diff;
		tot += diff;
		normdiff += diff2;
		norm += size2;
	}
	fprintf(stderr, "\nmaximum difference %g\n", max);
	fprintf(stderr, "average difference %g\n", tot / size);
	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
	double relmaxdiff = max / sqrt(norm);
	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
	double relavgdiff = (tot / size) / sqrt(norm);
	fprintf(stderr, "relative average difference %g\n", relavgdiff);
	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
		return EXIT_FAILURE;
	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
		return EXIT_FAILURE;
}
#endif

#ifdef STARPU_USE_CUDA
	/* Compare the cuFFT output against the FFTW output. */
{
	double max = 0., tot = 0., norm = 0., normdiff = 0.;
	for (i = 0; i < size; i++) {
		double diff = cabs(out_cuda[i]-out_fftw[i]);
		double diff2 = diff * diff;
		/* NB: this `size` shadows the outer element count inside the loop body. */
		double size = cabs(out_fftw[i]);
		double size2 = size * size;
		if (diff > max)
			max = diff;
		tot += diff;
		normdiff += diff2;
		norm += size2;
	}
	fprintf(stderr, "\nmaximum difference %g\n", max);
	fprintf(stderr, "average difference %g\n", tot / size);
	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
	double relmaxdiff = max / sqrt(norm);
	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
	double relavgdiff = (tot / size) / sqrt(norm);
	fprintf(stderr, "relative average difference %g\n", relavgdiff);
	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
		return EXIT_FAILURE;
	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
		return EXIT_FAILURE;
}
#endif

	STARPUFFT(free)(in);
	STARPUFFT(free)(out);

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(free)(out_fftw);
#endif

#ifdef STARPU_USE_CUDA
	free(out_cuda);
#endif

	starpu_shutdown();

	return EXIT_SUCCESS;
}
/*
 * Test driver for folding and fold optimisation.
 *
 * Loads a time series from a fixed path, copies it to the device, folds it
 * into nints x nbins sub-integrations, dumps the fold to
 * "original_fold.bin", runs the fold optimiser and prints the total and
 * per-iteration execution time.
 *
 * Both device allocations are checked, and the scratch buffers and the cuFFT
 * plan are released before returning.
 */
int main(void)
{
  TimeSeries<float> tim;
  tim.from_file("/lustre/home/ebarr/Soft/peasoup/tmp5.tim");
  DeviceTimeSeries<float> d_tim(tim);

  unsigned int size = d_tim.get_nsamps();

  TimeSeriesFolder folder(size);

  cudaError_t error;
  cufftResult result;

  float* folded_buffer = 0;
  error = cudaMalloc((void**)&folded_buffer, sizeof(float)*size);
  ErrorChecker::check_cuda_error(error);

  unsigned nints = 64;
  unsigned nbins = 32;

  cufftComplex* fft_out = 0;
  error = cudaMalloc((void**)&fft_out, sizeof(cufftComplex)*nints*nbins);
  ErrorChecker::check_cuda_error(error);

  cufftHandle plan;
  result = cufftPlan1d(&plan,nbins,CUFFT_R2C, nints);
  ErrorChecker::check_cufft_error(result);

  Stopwatch timer;

  FoldedSubints<float> folded_array(nbins,nints);

  std::cout << "made it here" << std::endl;

  FoldOptimiser optimiser(nbins,nints);

  // Number of timed iterations; also the divisor for the average below.
  const int niters = 1;

  timer.start();
  for (int ii=0;ii<niters;ii++){
    folder.fold(d_tim,folded_array,0.007453099228);
    Utils::dump_device_buffer<float>(folded_array.get_data(),nints*nbins,"original_fold.bin");
    optimiser.optimise(folded_array);
  }
  timer.stop();

  std::cout << "Total execution time (s): " << timer.getTime()<<std::endl;
  std::cout << "Average execution time (s): " << timer.getTime()/static_cast<double>(niters) << std::endl;

  // Release the plan and the device scratch buffers.
  result = cufftDestroy(plan);
  ErrorChecker::check_cufft_error(result);
  error = cudaFree(fft_out);
  ErrorChecker::check_cuda_error(error);
  error = cudaFree(folded_buffer);
  ErrorChecker::check_cuda_error(error);

  return 0;
}
/*
 * Wine thunk for cufftPlan1d: emits a trace line and forwards all arguments
 * unchanged to the native cufftPlan1d, returning its result.
 */
cufftResult WINAPI wine_cufftPlan1d(cufftHandle *plan, int nx,
                                    cufftType type, int batch)
{
    cufftResult ret;

    WINE_TRACE("\n");
    ret = cufftPlan1d(plan, nx, type, batch);
    return ret;
}
/*
 * Generates `nr_lines` lines of `linelength` samples of power-law noise and
 * writes them, one line after another, into the device memory behind
 * `y_buffer`.
 *
 * For every line:
 *   1. random noise is generated on the host (rand_01 + Box_Mueller) and
 *      rescaled;
 *   2. the transfer sequence for `PLindex` is built by recurrence;
 *   3. both are transformed on the GPU (D2Z), multiplied on the host as
 *      conj(noise) * transfer / linelength, and transformed back (Z2D);
 *   4. the result is divided by sqrt(linelength) and its mean is removed.
 *
 *   PLindex     - power-law index used in the transfer recurrence
 *   nr_lines    - number of lines to generate
 *   linelength  - samples per line
 *   y_buffer    - destination; buffer_add is read as an icl_local_buffer
 *                 whose `mem` is used as a device pointer.
 *                 NOTE(review): assumed to hold at least
 *                 linelength * nr_lines doubles - confirm with the caller.
 *   fielddim_z  - unused
 *
 * Nothing is written when linelength or nr_lines is not positive, when a host
 * allocation fails, or when a cuFFT plan cannot be created.
 */
void generate_PL_lines_fft(double PLindex, int nr_lines, int linelength,
                           icl_buffer *y_buffer, //double *y, // out
                           int fielddim_z)
{
    cufftHandle fftw_plan_transfer_fw, fftw_plan_noise_fw, fftw_plan_transfer_bw;
    int have_transfer_fw, have_noise_fw, have_transfer_bw;

    if (linelength <= 0 || nr_lines <= 0)
        return;

    // FFT preparation
    const int    n_freq     = linelength/2 + 1;      // length of the half spectrum
    const size_t out_size   = sizeof(cufftDoubleComplex)*(linelength/2+1);
    const size_t in_size    = sizeof(double)*(linelength+1);
    const size_t line_bytes = sizeof(double)*linelength;   // payload of one line
    const size_t y_count    = (size_t)linelength * (size_t)nr_lines;

    // host memory
    double *noise_in_host    = (double*) malloc(in_size);
    double *transfer_in_host = (double*) malloc(in_size);
    cufftDoubleComplex *noise_out_host    = (cufftDoubleComplex*) fftw_malloc(out_size);
    cufftDoubleComplex *transfer_out_host = (cufftDoubleComplex*) fftw_malloc(out_size);
    // one output array for all lines
    double *y_host = (double*) malloc(sizeof(cl_double) * y_count);

    if (!noise_in_host || !transfer_in_host || !noise_out_host ||
        !transfer_out_host || !y_host) {
        free(noise_in_host);
        free(transfer_in_host);
        fftw_free(noise_out_host);
        fftw_free(transfer_out_host);
        free(y_host);
        return;
    }

    // device memory
    double *noise_in_device;
    double *transfer_in_device;
    cufftDoubleComplex *noise_out_device;
    cufftDoubleComplex *transfer_out_device;
    checkCudaErrors(cudaMalloc((void **)&noise_in_device, in_size));
    checkCudaErrors(cudaMalloc((void **)&noise_out_device, out_size));
    checkCudaErrors(cudaMalloc((void **)&transfer_in_device, in_size));
    checkCudaErrors(cudaMalloc((void **)&transfer_out_device, out_size));

    // plans
    // XXX todo here we can potentially cufftPlanMany, to perform a batch of many plan1d fft
    have_transfer_fw = (cufftPlan1d(&fftw_plan_transfer_fw, linelength, CUFFT_D2Z, 1) == CUFFT_SUCCESS);
    have_noise_fw    = (cufftPlan1d(&fftw_plan_noise_fw,    linelength, CUFFT_D2Z, 1) == CUFFT_SUCCESS);
    have_transfer_bw = (cufftPlan1d(&fftw_plan_transfer_bw, linelength, CUFFT_Z2D, 1) == CUFFT_SUCCESS);

    if (have_transfer_fw && have_noise_fw && have_transfer_bw) {
        size_t index = 0;   // running position in y_host across all lines

        for (int v=0; v<nr_lines; v++) {
            for (int i=0; i<linelength; i++) {
                noise_in_host[i] = rand_01();
                transfer_in_host[i] = rand_01();
            }

            Box_Mueller(linelength, noise_in_host, transfer_in_host);

            for (int i=0; i<linelength; i++) {
                noise_in_host[i] = 2*noise_in_host[i]-1; // around 0
                noise_in_host[i] = 5*noise_in_host[i];   // changes the deviation, which values need to be put will be investigated
            }

            transfer_in_host[0]=1.0;
            for (int i=1; i<linelength; i++) {
                transfer_in_host[i] = (transfer_in_host[i-1]/(i))*(i-1-(PLindex/2.0));
            }

            /// (a) moving transfer_in and noise_in to the device
            checkCudaErrors(cudaMemcpy(noise_in_device, noise_in_host, line_bytes, cudaMemcpyHostToDevice));
            checkCudaErrors(cudaMemcpy(transfer_in_device, transfer_in_host, line_bytes, cudaMemcpyHostToDevice));

            cufftExecD2Z(fftw_plan_transfer_fw, transfer_in_device, transfer_out_device);
            cufftExecD2Z(fftw_plan_noise_fw, noise_in_device, noise_out_device);

            /// (b) moving back transfer_out and noise_out (blocking copies)
            checkCudaErrors(cudaMemcpy(transfer_out_host, transfer_out_device, out_size, cudaMemcpyDeviceToHost));
            checkCudaErrors(cudaMemcpy(noise_out_host, noise_out_device, out_size, cudaMemcpyDeviceToHost));

            // conj(noise) * transfer, normalised by the transform length;
            // the integer bound matches the allocated half spectrum
            for (int i=0; i<n_freq; i++) {
                double temp = (transfer_out_host[i].x*noise_out_host[i].x+transfer_out_host[i].y*noise_out_host[i].y) / linelength;
                transfer_out_host[i].y = (transfer_out_host[i].x*noise_out_host[i].y-transfer_out_host[i].y*noise_out_host[i].x) / linelength;
                transfer_out_host[i].x = temp;
            }

            /// (c) moving transfer_out to the device
            checkCudaErrors(cudaMemcpy(transfer_out_device, transfer_out_host, out_size, cudaMemcpyHostToDevice));

            cufftExecZ2D(fftw_plan_transfer_bw, transfer_out_device, transfer_in_device);

            /// (d) moving back transfer_in
            checkCudaErrors(cudaMemcpy(transfer_in_host, transfer_in_device, line_bytes, cudaMemcpyDeviceToHost));

            for (int i=0; i<linelength; i++) {
                transfer_in_host[i] = transfer_in_host[i]/sqrt((double) linelength);
            }

            // xxx reduce max/avg
            double average=0;
            for (int i=0; i<linelength; i++) {
                average = average + transfer_in_host[i];
            }
            average = average/linelength;

            // append the zero-mean line to the output array
            for (int i=0; i<linelength; i++) {
                y_host[index]=(transfer_in_host[i]-average);
                index++;
            }
        }

        /// (e) write y_buf: upload all generated lines in one copy
        icl_local_buffer* lbuf = (icl_local_buffer*)(y_buffer->buffer_add);
        checkCudaErrors(cudaMemcpy((void*)lbuf->mem, y_host, sizeof(cl_double) * y_count, cudaMemcpyHostToDevice));
    }

    free(y_host);
    free(transfer_in_host);
    free(noise_in_host);
    // memory obtained from fftw_malloc must go back through fftw_free
    fftw_free(transfer_out_host);
    fftw_free(noise_out_host);

    cudaFree(transfer_in_device);
    cudaFree(transfer_out_device);
    cudaFree(noise_out_device);
    cudaFree(noise_in_device);

    if (have_transfer_fw) cufftDestroy(fftw_plan_transfer_fw);
    if (have_transfer_bw) cufftDestroy(fftw_plan_transfer_bw);
    if (have_noise_fw)    cufftDestroy(fftw_plan_noise_fw);
}