/*! Computes the N-point DFT of signal X, scales the result by `scale`, and
 *  stores it in Y using cuFFT.
 *
 *  X and Y are HOST buffers of N cuComplex elements; staging device buffers
 *  are allocated and released internally on every path.
 *
 *  Returns true on success, false on any CUDA/cuFFT failure (the original
 *  ignored all error codes and always returned true). */
bool cuda_dft(cuComplex *Y, cuComplex *X, float scale, int N)
{
    size_t bytes = (size_t)N * sizeof(cuComplex);
    cuComplex *Y_gpu = NULL, *X_gpu = NULL;
    cufftHandle plan;
    bool ok = false;

    if (cudaMalloc((void **)&Y_gpu, bytes) != cudaSuccess) goto done;
    if (cudaMalloc((void **)&X_gpu, bytes) != cudaSuccess) goto done;

    /* Only the input must reach the device: Y_gpu is entirely overwritten by
     * the transform, so the original host->device copy of Y was dead work. */
    if (cudaMemcpy(X_gpu, X, bytes, cudaMemcpyHostToDevice) != cudaSuccess)
        goto done;

    if (cufftPlan1d(&plan, N, CUFFT_C2C, 1) != CUFFT_SUCCESS) goto done;
    if (cufftExecC2C(plan, X_gpu, Y_gpu, CUFFT_FORWARD) != CUFFT_SUCCESS)
    {
        cufftDestroy(plan);
        goto done;
    }
    cufftDestroy(plan);

    /* Blocking memcpy also synchronizes with the stream-ordered transform. */
    if (cudaMemcpy(Y, Y_gpu, bytes, cudaMemcpyDeviceToHost) != cudaSuccess)
        goto done;

    /* cuFFT transforms are unnormalized; apply the caller's scale on host. */
    for (int n = 0; n < N; n++)
    {
        Y[n].x *= scale;
        Y[n].y *= scale;
    }
    ok = true;

done:
    if (Y_gpu) cudaFree(Y_gpu);
    if (X_gpu) cudaFree(X_gpu);
    return ok;
}
/*
 * JNI entry point backing jcuda.jcufft.JCufft#cufftExecC2CNative.
 *
 * Class:     jcuda_jcufft_JCufft
 * Method:    cufftExecC2CNative
 * Signature: (Ljcuda/jcufft/cufftHandle;Ljcuda/Pointer;Ljcuda/Pointer;I)I
 *
 * Unwraps the Java-side plan handle and input/output Pointer objects, then
 * runs the single-precision C2C transform.  Returns the raw cufftResult as
 * jint, or JCUFFT_INTERNAL_ERROR (after throwing a NullPointerException)
 * when any required argument is null.
 */
JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftExecC2CNative
    (JNIEnv *env, jclass cla, jobject handle, jobject cIdata, jobject cOdata, jint direction)
{
    /* Validate arguments before touching native state: each null triggers a
     * Java NullPointerException and an internal-error return code. */
    if (handle == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftExecC2C");
        return JCUFFT_INTERNAL_ERROR;
    }
    if (cIdata == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'cIdata' is null for cufftExecC2C");
        return JCUFFT_INTERNAL_ERROR;
    }
    if (cOdata == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'cOdata' is null for cufftExecC2C");
        return JCUFFT_INTERNAL_ERROR;
    }
    Logger::log(LOG_TRACE, "Executing cufftExecC2C\n");

    /* Extract the native plan id stored in the Java cufftHandle object. */
    cufftHandle nativePlan = env->GetIntField(handle, cufftHandle_plan);
    /* getPointer() resolves the jcuda Pointer objects to raw device addresses;
     * NOTE(review): assumes both point to device memory — enforced by caller. */
    cufftComplex* nativeCIData = (cufftComplex*)getPointer(env, cIdata);
    cufftComplex* nativeCOData = (cufftComplex*)getPointer(env, cOdata);

    cufftResult result = cufftExecC2C(nativePlan, nativeCIData, nativeCOData, direction);
    return result;
}
/* Performs an in-place batched forward C2C FFT on device data: Ny transforms
 * of length Lx, queued on the caller-supplied CUDA stream.
 *
 * d_data must be a DEVICE pointer holding Ny*Lx interleaved complex floats.
 * Errors are reported to stderr (the original silently ignored all cuFFT
 * status codes and would execute an invalid plan on planning failure). */
extern "C" void cuda_fft(float *d_data, int Lx, int Ny, void *stream)
{
    cufftHandle plan;
    if (cufftPlan1d(&plan, Lx, CUFFT_C2C, Ny) != CUFFT_SUCCESS)
    {
        fprintf(stderr, "cuda_fft: cufftPlan1d failed\n");
        return;  /* do not execute/destroy an uninitialized plan */
    }
    cufftSetStream(plan, (cudaStream_t)stream);
    if (cufftExecC2C(plan, (cufftComplex*)d_data, (cufftComplex*)d_data,
                     CUFFT_FORWARD) != CUFFT_SUCCESS)
        fprintf(stderr, "cuda_fft: cufftExecC2C failed\n");
    cufftDestroy(plan);
}
/* Runs a fixed 256-point forward C2C FFT on freshly-allocated device scratch
 * buffers.  NOTE(review): dst_dp/src1_dp are currently unused — the transform
 * operates on uninitialized device memory; presumably a work-in-progress stub.
 *
 * Fixes over the original: the first buffer is released when the second
 * allocation fails (the original's own "BUG clean up previous malloc" note),
 * execution is skipped when plan creation fails, and `result` is freed. */
void g_fwdfft(QSP_ARG_DECL Data_Obj *dst_dp, Data_Obj *src1_dp)
{
	int NX = 256;          /* transform length */
	int BATCH = 1;         /* number of batched transforms */
	cufftResult_t status;
	cufftHandle plan;
	void *data;
	void *result;
	cudaError_t drv_err;

	/* Allocate device input buffer */
	drv_err = cudaMalloc(&data, sizeof(cufftComplex)*NX*BATCH);
	if( drv_err != cudaSuccess ){
		WARN("error allocating cuda data buffer for fft!?");
		return;
	}
	/* Allocate device output buffer */
	drv_err = cudaMalloc(&result, sizeof(cufftComplex)*NX*BATCH);
	if( drv_err != cudaSuccess ){
		WARN("error allocating cuda result buffer for fft!?");
		cudaFree(data);		/* fix: don't leak the first buffer */
		return;
	}

	/* Create plan for FFT */
	status = cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
	if (status != CUFFT_SUCCESS) {
		sprintf(ERROR_STRING, "Error in cufftPlan1d: %s\n", getCUFFTError(status));
		NWARN(ERROR_STRING);
		cudaFree(data);
		cudaFree(result);
		return;		/* fix: don't execute with an invalid plan */
	}

	/* Run forward fft on data */
	status = cufftExecC2C(plan, (cufftComplex *)data, (cufftComplex *)result, CUFFT_FORWARD);
	if (status != CUFFT_SUCCESS) {
		sprintf(ERROR_STRING, "Error in cufftExecC2C: %s\n", getCUFFTError(status));
		NWARN(ERROR_STRING);
	}

	/* Free resources */
	cufftDestroy(plan);
	cudaFree(data);
	cudaFree(result);	/* fix: was leaked by the original */
}
/*
 * Execute a local 1D single-precision C2C FFT with the given cuFFT plan.
 * A nonzero `dir` selects the inverse transform, zero the forward one.
 * Returns the cufftResult status code as an int (0 == CUFFT_SUCCESS).
 */
int dfft_cuda_local_fft( cuda_cpx_t *in, cuda_cpx_t *out, cuda_plan_t p, int dir)
{
    const int direction = dir ? CUFFT_INVERSE : CUFFT_FORWARD;
    return cufftExecC2C(p, in, out, direction);
}
/* Executes the forward FFT described by `h` on `data`, in place.
 *
 * If `data` lives in a different memory location than the FFT object, it is
 * first copied to h->location, transformed there, and copied back at the end.
 * CPU path: FFTPACK 2-D complex forward transform (1-D is unsupported and
 * sets OSKAR_ERR_FUNCTION_NOT_AVAILABLE).  GPU path: cuFFT Z2Z/C2C forward
 * transform using the pre-built h->cufft_plan (no-op if built without CUDA).
 * Errors are reported through the OSKAR-style `status` in/out flag. */
void oskar_fft_exec(oskar_FFT* h, oskar_Mem* data, int* status)
{
    oskar_Mem *data_copy = 0, *data_ptr = data;
    /* Stage the data into the FFT's own location if needed. */
    if (oskar_mem_location(data) != h->location)
    {
        data_copy = oskar_mem_create_copy(data, h->location, status);
        data_ptr = data_copy;
    }
    if (h->location == OSKAR_CPU)
    {
        if (h->num_dim == 1)
        {
            /* 1-D CPU transforms are not implemented. */
            *status = OSKAR_ERR_FUNCTION_NOT_AVAILABLE;
        }
        else if (h->num_dim == 2)
        {
            /* Square dim_size x dim_size transform, precision-dispatched. */
            if (h->precision == OSKAR_DOUBLE)
                oskar_fftpack_cfft2f(h->dim_size, h->dim_size, h->dim_size,
                        oskar_mem_double(data_ptr, status),
                        oskar_mem_double(h->fftpack_wsave, status),
                        oskar_mem_double(h->fftpack_work, status));
            else
                oskar_fftpack_cfft2f_f(h->dim_size, h->dim_size, h->dim_size,
                        oskar_mem_float(data_ptr, status),
                        oskar_mem_float(h->fftpack_wsave, status),
                        oskar_mem_float(h->fftpack_work, status));
            /* This step not needed for W-kernel generation, so turn it off.
             * (FFTPACK normalizes; rescale to match cuFFT's unnormalized
             * convention when consistency is requested.) */
            if (h->ensure_consistent_norm)
                oskar_mem_scale_real(data_ptr, (double)h->num_cells_total,
                        0, h->num_cells_total, status);
        }
    }
    else if (h->location == OSKAR_GPU)
    {
#ifdef OSKAR_HAVE_CUDA
        /* In-place forward transform with the pre-created cuFFT plan. */
        if (h->precision == OSKAR_DOUBLE)
            cufftExecZ2Z(h->cufft_plan,
                    (cufftDoubleComplex*) oskar_mem_void(data_ptr),
                    (cufftDoubleComplex*) oskar_mem_void(data_ptr),
                    CUFFT_FORWARD);
        else
            cufftExecC2C(h->cufft_plan,
                    (cufftComplex*) oskar_mem_void(data_ptr),
                    (cufftComplex*) oskar_mem_void(data_ptr),
                    CUFFT_FORWARD);
#endif
    }
    else *status = OSKAR_ERR_BAD_LOCATION;
    /* Copy the transformed data back to its original location. */
    if (oskar_mem_location(data) != h->location)
        oskar_mem_copy(data, data_ptr, status);
    oskar_mem_free(data_copy, status);
}
/* In-place single-precision forward 3-D C2C FFT of an nx * ny * nz volume on
 * the given CUDA stream.  d_data must be a DEVICE pointer to interleaved
 * complex floats.
 *
 * Fixes over the original: cufftSetCompatibilityMode() was called on the plan
 * handle BEFORE cufftPlan3d() initialized it (undefined behavior), and the
 * transform was executed even when plan creation failed. */
void fft3dGPU(T1* d_data, int nx, int ny, int nz, void* stream)
{
	cufftHandle plan;
	/* cuFFT wants dimensions slowest-varying first, hence (nz, ny, nx). */
	if (cufftPlan3d(&plan, nz, ny, nx, CUFFT_C2C) != CUFFT_SUCCESS)
	{
		fprintf(stderr, "CUFFT error: Plan creation failed");
		return;  /* nothing to configure, execute, or destroy */
	}
	/* Configure the (now valid) plan before executing it. */
	cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL);
	cufftSetStream(plan, (cudaStream_t) stream);
	cufftExecC2C(plan, (cufftComplex*) d_data, (cufftComplex*) d_data, CUFFT_FORWARD);
	cufftDestroy(plan);
}
/* Fast convolution of `in` with `kernel` into `out` via the FFT:
 *   out = IFFT( FFT(kernel) .* FFT(in) ) with 1/columns scaling folded into
 * the element-wise multiply (cuFFT transforms are unnormalized).
 *
 * All three pointers are treated as device data of `rows` x `columns`
 * interleaved complex values; `in` and `kernel` are transformed IN PLACE
 * despite their const-qualified types (hence the const_casts below).
 * When transform_kernel is true the kernel is forward-transformed first with
 * a separate single-row plan; otherwise it is assumed already in the
 * frequency domain.  D selects vector (1) vs matrix kernel multiply. */
void Fastconv_base<D, T, C>::fconv
  (T const* in, T const* kernel, T* out, length_type rows, length_type columns,
   bool transform_kernel)
{
  // convert pointers to types the CUFFT library accepts
  typedef cufftComplex ctype;
  ctype* d_out = reinterpret_cast<ctype*>(out);
  /* const_cast needed: cuFFT takes non-const pointers and we transform
     the "input" buffers in place. */
  ctype* d_kernel = const_cast<ctype*>(reinterpret_cast<ctype const*>(kernel));
  ctype* d_in = const_cast<ctype*>(reinterpret_cast<ctype const*>(in));

  cufftHandle plan;
  if (transform_kernel)
  {
    // Create a 1D FFT plan and transform the kernel
    cufftPlan1d(&plan, columns, CUFFT_C2C, 1);
    cufftExecC2C(plan, d_kernel, d_kernel, CUFFT_FORWARD);
    cufftDestroy(plan);
  }

  // Create a FFTM plan (batched: one `columns`-point FFT per row)
  cufftPlan1d(&plan, columns, CUFFT_C2C, rows);

  // transform the data (in place)
  cufftExecC2C(plan, d_in, d_in, CUFFT_FORWARD);

  // convolve with kernel, combine with scaling needed for inverse FFT
  typedef typename impl::scalar_of<T>::type scalar_type;
  scalar_type scale = 1 / static_cast<scalar_type>(columns);
  if (D == 1)
    vmmuls_row(kernel, in, out, scale, rows, columns);
  else
    mmmuls(kernel, in, out, scale, rows, columns);

  // inverse transform the signal (reusing the batched plan)
  cufftExecC2C(plan, d_out, d_out, CUFFT_INVERSE);
  cufftDestroy(plan);
}
/* Runs the inverse C2C transform of `in_mat` into `out_mat` using the
 * caller-supplied plan; both matrices must match the plan's geometry and be
 * device-resident.  Returns the cuFFT status code.
 *
 * Fix over the original: the local was named `cufftResult` (shadowing the
 * cuFFT result type) while the assertion referenced a non-existent `result`
 * identifier — it could never have compiled/checked as intended. */
static cufftResult_t centerdifft(const std::complex<float>* in_mat,
                                 std::complex<float>* out_mat,
                                 cufftHandle* fftplan)
{
  cufftResult_t result;
  result = cufftExecC2C(*fftplan, (cufftComplex*)in_mat,
                        (cufftComplex*)out_mat, CUFFT_INVERSE);
  AGILE_ASSERT(result == CUFFT_SUCCESS,
               StandardException::ExceptionMessage(
                 "Error during FFT procedure"));
  return result;
}
/* Computes the inverse FFT of `fft_x` into `x`, including the 1/(rows*cols)
 * normalization that cuFFT itself omits.  Requires `x` to have no pitch
 * padding (columns == pitch elements). */
void inverse(const GPUMatrixPitched<TSpectralType>& fft_x, GPUMatrixPitched<TSignalType>& x)
{
  // Zero-padded (pitched) rows are not supported by this implementation.
  AGILE_ASSERT(x.getNumColumns() == x.getPitchElements(),
               StandardException::ExceptionMessage(
                 "FFT of non-aligned matrices not implemented, yet"));

  // Make sure a plan matching the output dimensions exists.
  createPlan(x.getNumRows(), x.getNumColumns());

  // Run the inverse transform with type-mapped device pointers.
  const cufftResult status = cufftExecC2C(
      m_plan,
      (typename to_cufft_type<TSpectralType>::type*)fft_x.data(),
      (typename to_cufft_type<TSignalType>::type*)x.data(),
      CUFFT_INVERSE);
  AGILE_ASSERT(status == CUFFT_SUCCESS,
               StandardException::ExceptionMessage(
                 "Error during FFT procedure"));

  // Undo cuFFT's unnormalized convention.
  scale(TSignalType(1./(fft_x.getNumRows() * fft_x.getNumColumns())), x, x);
}
/* Converts a gridded-visibility plane into an image, in place:
 * normalise -> FFT-phase shift -> 2-D FFT (GPU cuFFT or CPU FFTPACK)
 * -> FFT-phase shift -> grid correction.  DFT algorithms return early since
 * their planes are already images.  `status` is the OSKAR in/out error flag. */
void oskar_imager_finalise_plane(oskar_Imager* h, oskar_Mem* plane, double plane_norm, int* status)
{
    int size, num_cells;
    DeviceData* d;
    if (*status) return;

    /* Apply normalisation (skip only when plane_norm is exactly zero). */
    if (plane_norm > 0.0 || plane_norm < 0.0)
        oskar_mem_scale_real(plane, 1.0 / plane_norm, status);

    /* DFT planes are already images; nothing more to do. */
    if (h->algorithm == OSKAR_ALGORITHM_DFT_2D || h->algorithm == OSKAR_ALGORITHM_DFT_3D)
        return;

    /* Check plane is complex type, as plane must be gridded visibilities. */
    if (!oskar_mem_is_complex(plane))
    {
        *status = OSKAR_ERR_TYPE_MISMATCH;
        return;
    }

    /* Make image using FFT and apply grid correction. */
    size = h->grid_size;
    num_cells = size * size;
    d = &h->d[0];
    if (oskar_mem_precision(plane) == OSKAR_DOUBLE)
    {
        /* Pre-FFT phase shift (moves the image centre). */
        oskar_fftphase_cd(size, size, oskar_mem_double(plane, status));
        if (h->fft_on_gpu)
        {
#ifdef OSKAR_HAVE_CUDA
            /* Round-trip through device memory for the cuFFT transform. */
            oskar_device_set(h->cuda_device_ids[0], status);
            oskar_mem_copy(d->plane_gpu, plane, status);
            cufftExecZ2Z(h->cufft_plan, oskar_mem_void(d->plane_gpu),
                    oskar_mem_void(d->plane_gpu), CUFFT_FORWARD);
            oskar_mem_copy(plane, d->plane_gpu, status);
#else
            *status = OSKAR_ERR_CUDA_NOT_AVAILABLE;
#endif
        }
        else
        {
            /* CPU fallback: FFTPACK normalizes, so rescale to match the
             * unnormalized cuFFT convention. */
            oskar_fftpack_cfft2f(size, size, size,
                    oskar_mem_double(plane, status),
                    oskar_mem_double(h->fftpack_wsave, status),
                    oskar_mem_double(h->fftpack_work, status));
            oskar_mem_scale_real(plane, (double)num_cells, status);
        }
        /* Post-FFT phase shift and grid (convolution-kernel) correction. */
        oskar_fftphase_cd(size, size, oskar_mem_double(plane, status));
        oskar_grid_correction_d(size, oskar_mem_double(h->corr_func, status),
                oskar_mem_double(plane, status));
    }
    else
    {
        /* Single-precision path; mirrors the double-precision branch. */
        oskar_fftphase_cf(size, size, oskar_mem_float(plane, status));
        if (h->fft_on_gpu)
        {
#ifdef OSKAR_HAVE_CUDA
            oskar_device_set(h->cuda_device_ids[0], status);
            oskar_mem_copy(d->plane_gpu, plane, status);
            cufftExecC2C(h->cufft_plan, oskar_mem_void(d->plane_gpu),
                    oskar_mem_void(d->plane_gpu), CUFFT_FORWARD);
            oskar_mem_copy(plane, d->plane_gpu, status);
#else
            *status = OSKAR_ERR_CUDA_NOT_AVAILABLE;
#endif
        }
        else
        {
            oskar_fftpack_cfft2f_f(size, size, size,
                    oskar_mem_float(plane, status),
                    oskar_mem_float(h->fftpack_wsave, status),
                    oskar_mem_float(h->fftpack_work, status));
            oskar_mem_scale_real(plane, (double)num_cells, status);
        }
        oskar_fftphase_cf(size, size, oskar_mem_float(plane, status));
        /* NOTE(review): corr_func is read as double even on the float path —
         * looks intentional (correction function stored in double); confirm. */
        oskar_grid_correction_f(size, oskar_mem_double(h->corr_func, status),
                oskar_mem_float(plane, status));
    }
}
/* StarPU FFT test driver: transforms a random 1-D (argv: n) or 2-D
 * (argv: n m) complex signal with StarPU-FFT and, when available, with FFTW
 * and raw cuFFT for timing and accuracy comparison.
 *
 * Fix over the original: both accuracy-comparison blocks read `out_fftw`,
 * which only exists under STARPU_HAVE_FFTW — the CUDA comparison was guarded
 * by STARPU_USE_CUDA alone and failed to compile without FFTW.  Also renamed
 * the inner `size` locals that shadowed the outer element count. */
int main(int argc, char *argv[])
{
	int i;
	struct timeval begin, end;
	int size;
	size_t bytes;
	int n = 0, m = 0;
	STARPUFFT(plan) plan;
#ifdef STARPU_HAVE_FFTW
	_FFTW(plan) fftw_plan;
#endif
#ifdef STARPU_USE_CUDA
	cufftHandle cuda_plan;
	cudaError_t cures;
#endif
	double timing;

	if (argc < 2 || argc > 3) {
		fprintf(stderr,"need one or two size of vector\n");
		exit(EXIT_FAILURE);
	}

	starpu_init(NULL);

	if (argc == 2) {
		n = atoi(argv[1]);
		/* 1D */
		size = n;
	} else if (argc == 3) {
		n = atoi(argv[1]);
		m = atoi(argv[2]);
		/* 2D */
		size = n * m;
	} else {
		assert(0);
	}

	bytes = size * sizeof(STARPUFFT(complex));

	/* Random complex input signal. */
	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
	starpu_srand48(0);
	for (i = 0; i < size; i++)
		in[i] = starpu_drand48() + I * starpu_drand48();

	STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out));

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw));
#endif

#ifdef STARPU_USE_CUDA
	STARPUFFT(complex) *out_cuda = malloc(size * sizeof(*out_cuda));
#endif

	/* Build the StarPU / FFTW / cuFFT plans for the requested rank. */
	if (argc == 2) {
		plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
			printf("erf\n");
#endif
	} else if (argc == 3) {
		plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
#endif
	} else {
		assert(0);
	}

#ifdef STARPU_HAVE_FFTW
	/* Time the FFTW reference transform. */
	gettimeofday(&begin, NULL);
	_FFTW(execute)(fftw_plan);
	gettimeofday(&end, NULL);
	_FFTW(destroy_plan)(fftw_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif
#ifdef STARPU_USE_CUDA
	/* Time the raw cuFFT transform.
	 * NOTE(review): `in`/`out_cuda` are host buffers passed straight to
	 * cufftExecC2C, which expects device pointers — verify upstream. */
	gettimeofday(&begin, NULL);
	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
		printf("erf2\n");
	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
		STARPU_CUDA_REPORT_ERROR(cures);
	gettimeofday(&end, NULL);
	cufftDestroy(cuda_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif

	/* Run the StarPU transform under test. */
	STARPUFFT(execute)(plan, in, out);
	STARPUFFT(showstats)(stdout);
	STARPUFFT(destroy_plan)(plan);
	printf("\n");

#if 0
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(in[i]), creal(in[i]));
	printf("\n\n");
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out[i]), creal(out[i]));
	printf("\n\n");
#ifdef STARPU_HAVE_FFTW
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i]));
	printf("\n\n");
#endif
#endif

#ifdef STARPU_HAVE_FFTW
	/* Compare StarPU output against the FFTW reference. */
	{
		double max = 0., tot = 0., norm = 0., normdiff = 0.;
		for (i = 0; i < size; i++) {
			double diff = cabs(out[i]-out_fftw[i]);
			double diff2 = diff * diff;
			double mag = cabs(out_fftw[i]);	/* renamed: was shadowing `size` */
			double mag2 = mag * mag;
			if (diff > max)
				max = diff;
			tot += diff;
			normdiff += diff2;
			norm += mag2;
		}
		fprintf(stderr, "\nmaximum difference %g\n", max);
		fprintf(stderr, "average difference %g\n", tot / size);
		fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
		double relmaxdiff = max / sqrt(norm);
		fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
		double relavgdiff = (tot / size) / sqrt(norm);
		fprintf(stderr, "relative average difference %g\n", relavgdiff);
		if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
			return EXIT_FAILURE;
		if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
			return EXIT_FAILURE;
	}
#endif
/* fix: this block reads out_fftw, so it also needs STARPU_HAVE_FFTW. */
#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_FFTW)
	/* Compare raw cuFFT output against the FFTW reference. */
	{
		double max = 0., tot = 0., norm = 0., normdiff = 0.;
		for (i = 0; i < size; i++) {
			double diff = cabs(out_cuda[i]-out_fftw[i]);
			double diff2 = diff * diff;
			double mag = cabs(out_fftw[i]);	/* renamed: was shadowing `size` */
			double mag2 = mag * mag;
			if (diff > max)
				max = diff;
			tot += diff;
			normdiff += diff2;
			norm += mag2;
		}
		fprintf(stderr, "\nmaximum difference %g\n", max);
		fprintf(stderr, "average difference %g\n", tot / size);
		fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
		double relmaxdiff = max / sqrt(norm);
		fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
		double relavgdiff = (tot / size) / sqrt(norm);
		fprintf(stderr, "relative average difference %g\n", relavgdiff);
		if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
			return EXIT_FAILURE;
		if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
			return EXIT_FAILURE;
	}
#endif

	STARPUFFT(free)(in);
	STARPUFFT(free)(out);
#ifdef STARPU_HAVE_FFTW
	STARPUFFT(free)(out_fftw);
#endif
#ifdef STARPU_USE_CUDA
	free(out_cuda);
#endif

	starpu_shutdown();

	return EXIT_SUCCESS;
}
/* Tests FFT-based 2-D convolution (custom R2C/C2R layout driven through a
 * C2C plan of size fftH x fftW/2) of random 2000x2000 data with a 7x6 kernel
 * against a CPU reference.  Returns true when the relative L2 error is
 * below 1e-6.
 *
 * Fix over the original: the per-pixel reference magnitude summed
 * rCPU*rCPU twice; it must combine BOTH values (rCPU^2 + rGPU^2) for the
 * relative-error metric to be meaningful. */
bool test2(void)
{
    float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU;
    float *d_Data, *d_Kernel, *d_PaddedData, *d_PaddedKernel;
    fComplex *d_DataSpectrum0, *d_KernelSpectrum0;
    cufftHandle fftPlan;
    bool bRetVal;
    StopWatchInterface *hTimer = NULL;
    sdkCreateTimer(&hTimer);

    printf("Testing updated custom R2C / C2R FFT-based convolution\n");

    const int kernelH = 7;
    const int kernelW = 6;
    const int kernelY = 3;
    const int kernelX = 4;
    const int dataH = 2000;
    const int dataW = 2000;
    /* Round the padded sizes up to efficient transform lengths. */
    const int fftH = snapTransformSize(dataH + kernelH - 1);
    const int fftW = snapTransformSize(dataW + kernelW - 1);

    printf("...allocating memory\n");
    h_Data      = (float *)malloc(dataH * dataW * sizeof(float));
    h_Kernel    = (float *)malloc(kernelH * kernelW * sizeof(float));
    h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float));
    h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float));
    checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum0, fftH * (fftW / 2) * sizeof(fComplex)));
    checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum0, fftH * (fftW / 2) * sizeof(fComplex)));

    printf("...generating random input data\n");
    srand(2010);
    for (int i = 0; i < dataH * dataW; i++)
    {
        h_Data[i] = getRand();
    }
    for (int i = 0; i < kernelH * kernelW; i++)
    {
        h_Kernel[i] = getRand();
    }

    /* Real data is packed as half-width complex, hence fftW / 2. */
    printf("...creating C2C FFT plan for %i x %i\n", fftH, fftW / 2);
    checkCudaErrors(cufftPlan2d(&fftPlan, fftH, fftW / 2, CUFFT_C2C));

    printf("...uploading to GPU and padding convolution kernel and input data\n");
    checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float)));
    padDataClampToBorder(
        d_PaddedData, d_Data, fftH, fftW, dataH, dataW,
        kernelH, kernelW, kernelY, kernelX);
    padKernel(
        d_PaddedKernel, d_Kernel, fftH, fftW,
        kernelH, kernelW, kernelY, kernelX);

    //CUFFT_INVERSE works just as well...
    const int FFT_DIR = CUFFT_FORWARD;

    //Not including kernel transformation into time measurement,
    //since convolution kernel is not changed very frequently
    printf("...transforming convolution kernel\n");
    checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum0, FFT_DIR));

    printf("...running GPU FFT convolution: ");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    /* forward FFT -> spectral multiply -> inverse FFT (-FFT_DIR) */
    checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedData, (cufftComplex *)d_DataSpectrum0, FFT_DIR));
    spProcess2D(d_DataSpectrum0, d_DataSpectrum0, d_KernelSpectrum0, fftH, fftW / 2, FFT_DIR);
    checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_DataSpectrum0, (cufftComplex *)d_PaddedData, -FFT_DIR));
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    double gpuTime = sdkGetTimerValue(&hTimer);
    printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime);

    printf("...reading back GPU FFT results\n");
    checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost));

    printf("...running reference CPU convolution\n");
    convolutionClampToBorderCPU(
        h_ResultCPU, h_Data, h_Kernel, dataH, dataW,
        kernelH, kernelW, kernelY, kernelX);

    printf("...comparing the results: ");
    double sum_delta2 = 0;
    double sum_ref2 = 0;
    double max_delta_ref = 0;
    for (int y = 0; y < dataH; y++)
    {
        for (int x = 0; x < dataW; x++)
        {
            double rCPU = (double)h_ResultCPU[y * dataW + x];
            double rGPU = (double)h_ResultGPU[y * fftW + x];
            double delta = (rCPU - rGPU) * (rCPU - rGPU);
            /* fix: was rCPU * rCPU + rCPU * rCPU */
            double ref = rCPU * rCPU + rGPU * rGPU;
            if ((delta / ref) > max_delta_ref)
            {
                max_delta_ref = delta / ref;
            }
            sum_delta2 += delta;
            sum_ref2 += ref;
        }
    }
    double L2norm = sqrt(sum_delta2 / sum_ref2);
    printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref));
    bRetVal = (L2norm < 1e-6) ? true : false;
    printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n");

    printf("...shutting down\n");
    sdkStartTimer(&hTimer);
    checkCudaErrors(cufftDestroy(fftPlan));
    checkCudaErrors(cudaFree(d_KernelSpectrum0));
    checkCudaErrors(cudaFree(d_DataSpectrum0));
    checkCudaErrors(cudaFree(d_PaddedKernel));
    checkCudaErrors(cudaFree(d_PaddedData));
    checkCudaErrors(cudaFree(d_Kernel));
    checkCudaErrors(cudaFree(d_Data));
    free(h_ResultGPU);
    free(h_ResultCPU);
    free(h_Kernel);
    free(h_Data);
    return bRetVal;
}
/**
 * Execute single precision C2C plan. This function is blocking and only returns after the transform is completed.
 * @note For inplace transforms, data_out should point to the same memory address as data, AND
 * the plan must have been created as inplace.
 * @param plan FFT plan created by \ref accfft_plan_dft_3d_r2cf.
 * @param direction -1 for forward transform, 1 for inverse.
 * @param data_d Input device data (defaults to the plan's buffer when NULL).
 * @param data_out_d Output device data (defaults to the plan's buffer when NULL).
 * @param timer See \ref timer for more details (accumulated in-place; may be NULL).
 * @param xyz a bit set field that determines which directions FFT should be executed
 *
 * Pipeline: per-axis cuFFT transforms interleaved with pencil transposes
 * (T_plan_*) and MPI barriers; FFT time is measured with CUDA events and
 * accumulated into timings[4].
 */
void accfft_execute_c2c_gpuf(accfft_plan_gpuf* plan, int direction,Complexf * data_d, Complexf * data_out_d, double * timer,std::bitset<3> xyz){

  if(!plan->c2c_plan_baked){
    if(plan->procid==0) std::cout<<"Error. r2c plan has not been made correctly. Please first create the plan before calling execute functions."<<std::endl;
    return;
  }

  /* NULL arguments fall back to the plan's internal buffers. */
  if(data_d==NULL)
    data_d=plan->data_c;
  if(data_out_d==NULL)
    data_out_d=plan->data_out_c;

  int * coords=plan->coord;
  int procid=plan->procid;
  double fft_time=0;
  double timings[5]={0};

  int NY=plan->N[1];
  cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
  cudaEvent_t fft_startEvent, fft_stopEvent;
  checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
  checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
  cufftResult_t cufft_error;
  float dummy_time=0;

  /* Pencil-decomposition sizes/offsets for each stage of the pipeline. */
  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  if(direction==-1){
    /**************************************************************/
    /******************* N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/
    // FFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft(cufftExecC2C(plan->fplan_0,(cufftComplex*)data_d, (cufftComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;  /* skipped axis: output aliases input */

    /* Transpose into the Y-pencil layout (skipped for 1-D decompositions). */
    if(!plan->oneD){
      plan->T_plan_1->execute_gpu(plan->T_plan_1,(float*)data_out_d,timings,2,osize_0[0],coords[0]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    /**************************************************************/
    /******************* N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    // FFT in Y direction, one slab at a time
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1[0];++i){
        checkCuda_accfft(cufftExecC2C(plan->fplan_1,(cufftComplex*)&data_out_d[i*osize_1[1]*osize_1[2]], (cufftComplex*)&data_out_d[i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    MPI_Barrier(plan->c_comm);

    /* Transpose into the X-pencil layout. */
    if(plan->oneD){
      plan->T_plan_2->execute_gpu(plan->T_plan_2,(float*)data_out_d,timings,2);
    }
    else{
      plan->T_plan_2->execute_gpu(plan->T_plan_2,(float*)data_out_d,timings,2,1,coords[1]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    /**************************************************************/
    /******************* N0 x N1/P0 x N2/P1 **********************/
    /**************************************************************/
    // FFT in X direction
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft(cufftExecC2C(plan->fplan_2,(cufftComplex*)data_out_d, (cufftComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
  }
  else if (direction==1){
    /* Inverse pipeline: exact mirror of the forward path. */
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecC2C(plan->fplan_2,(cufftComplex*)data_d, (cufftComplex*)data_d,CUFFT_INVERSE));
      checkCuda_accfft (cudaDeviceSynchronize());
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }

    if(plan->oneD){
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(float*)data_d,timings,1);
    }
    else{
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(float*)data_d,timings,1,1,coords[1]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    /**************************************************************/
    /******************* N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1i[0];++i){
        checkCuda_accfft (cufftExecC2C(plan->fplan_1,(cufftComplex*)&data_d[i*NY*osize_1i[2]], (cufftComplex*)&data_d[i*NY*osize_1i[2]],CUFFT_INVERSE));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    MPI_Barrier(plan->c_comm);

    if(!plan->oneD){
      plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(float*)data_d,timings,1,osize_1i[0],coords[0]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    /**************************************************************/
    /******************* N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/
    // IFFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecC2C(plan->fplan_0,(cufftComplex*)data_d,(cufftComplex*)data_out_d,CUFFT_INVERSE));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;
  }

  timings[4]+=fft_time;
  /* Accumulate per-phase timings into the caller's array (if provided). */
  if(timer==NULL){
    //delete [] timings;
  }
  else{
    timer[0]+=timings[0];
    timer[1]+=timings[1];
    timer[2]+=timings[2];
    timer[3]+=timings[3];
    timer[4]+=timings[4];
  }
  MPI_Barrier(plan->c_comm);
  return;
}
/* Executes the single-precision REAL transform described by `plan`:
 * forward (direction == -1) runs R2C along Z then C2C along Y and X;
 * inverse (direction == 1) runs C2C inverse along X and Y then C2R along Z.
 * NULL data pointers fall back to the plan's internal buffers.  Per-axis
 * FFT time is measured with CUDA events and accumulated into timer[4]
 * when `timer` is non-NULL; `xyz` selects which axes are transformed.
 * Note: the 2*i offsets below index float pairs (interleaved complex). */
void accfft_execute_gpuf(accfft_plan_gpuf* plan, int direction,float * data_d, float * data_out_d, double * timer,std::bitset<3> xyz){

  if(data_d==NULL)
    data_d=plan->data;
  if(data_out_d==NULL)
    data_out_d=plan->data_out;
  int * coords=plan->coord;
  int procid=plan->procid;
  double fft_time=0;
  double timings[5]={0};

  cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
  cudaEvent_t fft_startEvent, fft_stopEvent;
  checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
  checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
  int NY=plan->N[1];
  float dummy_time=0;

  /* Pencil sizes for each pipeline stage (ostart_* not needed here). */
  int *osize_0 =plan->osize_0;
  int *osize_1 =plan->osize_1;
  int *osize_1i=plan->osize_1i;

  if(direction==-1){
    /**************************************************************/
    /******************* N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/
    // FFT in Z direction (real-to-complex)
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecR2C(plan->fplan_0,(cufftReal*)data_d, (cufftComplex*)data_out_d));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;  /* skipped axis: output aliases input */

    // Perform N0/P0 transpose
    if(!plan->oneD){
      plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2,osize_0[0],coords[0]);
    }

    /**************************************************************/
    /******************* N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    // FFT in Y direction, one slab at a time (offsets are in floats: 2 per complex)
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1[0];++i){
        checkCuda_accfft (cufftExecC2C(plan->fplan_1,(cufftComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]], (cufftComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }

    if(plan->oneD){
      plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2);
    }
    else{
      plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2,1,coords[1]);
    }

    /**************************************************************/
    /******************* N0 x N1/P0 x N2/P1 **********************/
    /**************************************************************/
    // FFT in X direction
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecC2C(plan->fplan_2,(cufftComplex*)data_out_d, (cufftComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
  }
  else if (direction==1){
    /* Inverse pipeline: mirror of the forward path, ending with C2R. */
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecC2C(plan->fplan_2,(cufftComplex*)data_d, (cufftComplex*)data_d,CUFFT_INVERSE));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }

    if(plan->oneD){
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(float*)data_d,timings,1);
    }
    else{
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(float*)data_d,timings,1,1,coords[1]);
    }

    /**************************************************************/
    /******************* N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1i[0];++i){
        checkCuda_accfft (cufftExecC2C(plan->fplan_1,(cufftComplex*)&data_d[2*i*NY*osize_1i[2]], (cufftComplex*)&data_d[2*i*NY*osize_1i[2]],CUFFT_INVERSE));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }

    if(!plan->oneD){
      plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(float*)data_d,timings,1,osize_1i[0],coords[0]);
    }
    MPI_Barrier(plan->c_comm);

    /**************************************************************/
    /******************* N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/
    // IFFT in Z direction (complex-to-real)
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecC2R(plan->iplan_0,(cufftComplex*)data_d,(cufftReal*)data_out_d));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;
  }

  timings[4]+=fft_time;
  /* Accumulate per-phase timings into the caller's array (if provided). */
  if(timer==NULL){
    //delete [] timings;
  }
  else{
    timer[0]+=timings[0];
    timer[1]+=timings[1];
    timer[2]+=timings[2];
    timer[3]+=timings[3];
    timer[4]+=timings[4];
  }
  MPI_Barrier(plan->c_comm);
  return;
}
void WorkerThread::createNewFilter() { // Free GPU memory from current filter and CUFFT cudaFree(_gabor_data); cudaFree(_gpu_image_0); cudaFree(_gpu_image_1); cufftDestroy(_fft_plan); float* gaussian_data; cudaMalloc((void**)&gaussian_data, sizeof(float) * _filter_pixels); int2 gaussian_size; gaussian_size.x = _filter_size; gaussian_size.y = _filter_size; int2 gaussian_center; gaussian_center.x = _filter_size / 2; gaussian_center.y = _filter_size / 2; gaussian(gaussian_data, _new_theta, _new_sigma, 1.0, gaussian_center, gaussian_size); float* harmonic_data; cudaMalloc((void**)&harmonic_data, sizeof(float) * _filter_pixels * 2); int2 harmonic_size; harmonic_size.x = _filter_size; harmonic_size.y = _filter_size; int2 harmonic_center; harmonic_center.x = _filter_size / 2; harmonic_center.y = _filter_size / 2; harmonic(harmonic_data, _new_theta, _new_lambda, _new_psi, harmonic_center, harmonic_size); float* host_harmonic = new float[_filter_size * _filter_size * 2]; cudaMalloc((void**)&_gabor_data, sizeof(float) * _filter_pixels * 2); int2 gabor_size; gabor_size.x = _filter_size; gabor_size.y = _filter_size; int2 gabor_center; gabor_center.x = _filter_size / 2; gabor_center.y = _filter_size / 2; multiplyRealComplex(gaussian_data, harmonic_data, _gabor_data, _filter_size * _filter_size); float* host_gabor_data = new float[_filter_pixels * 2]; cudaMemcpy(host_gabor_data, _gabor_data, sizeof(float) * _filter_pixels * 2, cudaMemcpyDeviceToHost); //pad the filter { float* data = host_gabor_data; float* target = _filter_image; memset(target, 0, sizeof(float) * _padded_pixels * 2); int padded_stride = 2 * _padded_size; int target_stride = 2 * _target_size; for (int i = 0; i < _target_size; ++i) { memcpy(target, data, sizeof(float) * target_stride); target += padded_stride; data += target_stride; } } // Copy gabor data into member for texture creation _filter_image_mutex.lock(); memcpy(_host_gabor_data, host_gabor_data, sizeof(float) * _filter_pixels * 2); 
_filter_image_mutex.unlock(); cudaFree(_gabor_data); cudaMalloc((void**)&_gabor_data, sizeof(float) * _padded_pixels * 2); cudaMalloc((void**)&_gpu_image_0, sizeof(float) * _padded_pixels * 2); cudaMalloc((void**)&_gpu_image_1, sizeof(float) * _padded_pixels * 2); cudaMemcpy(_gabor_data, _filter_image, sizeof(float) * _padded_pixels * 2, cudaMemcpyHostToDevice); cufftPlan2d(&_fft_plan, _padded_size, _padded_size, CUFFT_C2C); cufftExecC2C(_fft_plan, (cufftComplex*)(_gabor_data), (cufftComplex*)(_gabor_data), CUFFT_FORWARD); cudaMemcpy(_filter_image, _gabor_data, sizeof(float) * _padded_pixels * 2, cudaMemcpyDeviceToHost); // Free temporary GPU memory used for creation of filter cudaFree(gaussian_data); cudaFree(harmonic_data); delete host_harmonic; delete host_gabor_data; _should_create_new_filter = false; emit newFilterImage(); }
int main2(int sockfd) { cufftHandle plan; cufftComplex *devPtr; cufftReal indata[NX*BATCH]; cufftComplex data[NX*BATCH]; int i,timer,j,k; char fname[15]; FILE *f; #define BUFSIZE (21*4096*sizeof(int)) int buffer[BUFSIZE]; int p,nread; f = fopen("21-4096","rb"); nread=fread(buffer,BUFSIZE,1,f); printf("nread=%i\n",nread); fclose(f); i=0; for (j=0;j<BATCH;j++) { for (k=0;k<NX;k++) { data[j*NX+k].x = buffer[j*NX+k]; data[j*NX+k].y = 0; } } //f=fopen("y.txt","r"); /* source data creation */ //int sockfd = myconnect(); //printf("connected\n"); /* WORKING!!!!!!!! i=0; for (j=0;j<BATCH;j++) { sprintf(fname,"%i.txt",j); printf("%s\n",fname); f = fopen(fname,"r"); for (k=0;k<NX;k++) { fscanf(f,"%i\n",&p); data[j*NX+k].x = p; data[j*NX+k].y = 0; } fclose(f); */ /* for(i= 0 ; i < NX*BATCH ; i++){ //fscanf(f,"%i\n",&p); //data[i].x= p; data[i].x= 1.0f; //printf("%f\n",data[i].x); data[i].y = 0.0f; } //fclose(f) */ //} /* creates 1D FFT plan */ cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH); /* cutCreateTimer(&timer); cutResetTimer(timer); cutStartTimer(timer); */ /* GPU memory allocation */ cudaMalloc((void**)&devPtr, sizeof(cufftComplex)*NX*BATCH); /* transfer to GPU memory */ cudaMemcpy(devPtr, data, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyHostToDevice); /* executes FFT processes */ cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD); /* executes FFT processes (inverse transformation) */ //cufftExecC2C(plan, devPtr, devPtr, CUFFT_INVERSE); /* transfer results from GPU memory */ cudaMemcpy(data, devPtr, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyDeviceToHost); /* deletes CUFFT plan */ cufftDestroy(plan); /* frees GPU memory */ cudaFree(devPtr); /* cudaThreadSynchronize(); cutStopTimer(timer); printf("%f\n",cutGetTimerValue(timer)/(float)1000); cutDeleteTimer(timer); */ /* float mag; for(i = 0 ; i < NX*BATCH ; i++){ //printf("data[%d] %f %f\n", i, data[i].x, data[i].y); //printf("%f\n", data[i].x); mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX; printf("%f\n",mag); } */ /* 
// save as text file float mag; i=0; for (j=0;j<BATCH;j++) { sprintf(fname,"%i-mag.txt",j); printf("%s\n",fname); f = fopen(fname,"w"); for (k=0;k<NX;k++) { //fscanf(f,"%i\n",&p); if (k>50) continue; i = j*NX+k; mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX; fprintf(f,"%f\n",mag); } fclose(f); } */ float mag; i=0; float mags[NX]; int magsint[NX*BATCH]; memset(magsint,0,sizeof(int)*NX*BATCH); int u = 0; printf("%f %f %f %f\n",data[0].x,data[1].x,data[2].x,data[3].x); //printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]); // f = fopen("ffts.bin","wb"); for (j=0;j<BATCH;j++) { // sprintf(fname,"%i-bin.dat",j); // printf("%s\n",fname); for (k=0;k<NX;k++) { //fscanf(f,"%i\n",&p); if (k>50) continue; i = j*NX+k; mags[k] = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX; magsint[u]=mags[k] ; u++; //fprintf(f,"%f\n",mag); } //f = fopen(fname,"wb"); // fwrite(magsint,sizeof(int)*50,1,f); } int n; n = write(sockfd,magsint,sizeof(int)*BATCH*50); printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]); printf("send ok, size: %i\n",n); //fclose(f); return 0; }
void WorkerThread::createInitialFilter() { float* gaussian_data; cudaMalloc((void**)&gaussian_data, sizeof(float) * _filter_pixels); int2 gaussian_size; gaussian_size.x = _filter_size; gaussian_size.y = _filter_size; int2 gaussian_center; gaussian_center.x = _filter_size / 2; gaussian_center.y = _filter_size / 2; gaussian(gaussian_data, 0.0, _sigma, 1.0, gaussian_center, gaussian_size); float* harmonic_data; cudaMalloc((void**)&harmonic_data, sizeof(float) * _filter_pixels * 2); int2 harmonic_size; harmonic_size.x = _filter_size; harmonic_size.y = _filter_size; int2 harmonic_center; harmonic_center.x = _filter_size / 2; harmonic_center.y = _filter_size / 2; harmonic(harmonic_data, 0, _lambda, 0.0, harmonic_center, harmonic_size); float* host_harmonic = new float[_filter_size * _filter_size * 2]; cudaMalloc((void**)&_gabor_data, sizeof(float) * _filter_pixels * 2); int2 gabor_size; gabor_size.x = _filter_size; gabor_size.y = _filter_size; int2 gabor_center; gabor_center.x = _filter_size / 2; gabor_center.y = _filter_size / 2; multiplyRealComplex(gaussian_data, harmonic_data, _gabor_data, _filter_size * _filter_size); float* host_gabor_data = new float[_filter_pixels * 2]; cudaMemcpy(host_gabor_data, _gabor_data, sizeof(float) * _filter_pixels * 2, cudaMemcpyDeviceToHost); //pad the filter { float* data = host_gabor_data; float* target = _filter_image; memset(target, 0, sizeof(float) * _padded_pixels * 2); int padded_stride = 2 * _padded_size; int target_stride = 2 * _target_size; for (int i = 0; i < _target_size; ++i) { memcpy(target, data, sizeof(float) * target_stride); target += padded_stride; data += target_stride; } } // Copy gabor data into member for texture creation _filter_image_mutex.lock(); memcpy(_host_gabor_data, host_gabor_data, sizeof(float) * _filter_pixels * 2); _filter_image_mutex.unlock(); cudaFree(_gabor_data); cudaMalloc((void**)&_gabor_data, sizeof(float) * _padded_pixels * 2); cudaMalloc((void**)&_gpu_image_0, sizeof(float) * _padded_pixels * 
2); cudaMalloc((void**)&_gpu_image_1, sizeof(float) * _padded_pixels * 2); cudaMemcpy(_gabor_data, _filter_image, sizeof(float) * _padded_pixels * 2, cudaMemcpyHostToDevice); cufftPlan2d(&_fft_plan, _padded_size, _padded_size, CUFFT_C2C); cufftExecC2C(_fft_plan, (cufftComplex*)(_gabor_data), (cufftComplex*)(_gabor_data), CUFFT_FORWARD); cudaMemcpy(_filter_image, _gabor_data, sizeof(float) * _padded_pixels * 2, cudaMemcpyDeviceToHost); emit newFilterImage(); }
/* Wine thunk: traces the call, then forwards cufftExecC2C unchanged to the
 * native CUFFT library and hands its status back to the Windows caller. */
cufftResult WINAPI wine_cufftExecC2C(cufftHandle plan, cufftComplex *idata, cufftComplex *odata, int direction){
    WINE_TRACE("\n");
    cufftResult status = cufftExecC2C(plan, idata, odata, direction);
    return status;
}
// Convolves a _target_size x _target_size window of the source image with the
// precomputed frequency-domain Gabor filter (_gabor_data) via FFT:
// pad -> forward FFT -> complex multiply -> inverse FFT -> crop.
// `data` points at the full interleaved-complex source image
// (2 floats per pixel, row stride _original_image_width pixels).
// Results land in _result_data (cropped) and _padded_image (full padded).
// NOTE(review): cuFFT/CUDA return codes are not checked — errors pass silently.
void WorkerThread::gaborFilter(float *data)
{
    //get the subimage and pad it
    {
        float* target = _padded_image;
        // Zero the whole padded buffer first; only the window is overwritten.
        memset(target, 0, sizeof(float) * _padded_pixels * 2);
        // Window is centered on (_target_x, _target_y) in the source image.
        int left = _target_x - _target_size / 2;
        int bottom = _target_y - _target_size / 2;
        data += 2 * left;                              // 2 floats per complex pixel
        int original_stride = 2 * _original_image_width;
        int padded_stride = 2 * _padded_size;
        int target_stride = 2 * _target_size;
        data += bottom * original_stride;              // advance to first window row
        for (int i = 0; i < _target_size; ++i)
        {
            memcpy(target, data, sizeof(float) * target_stride);
            target += padded_stride;
            data += original_stride;
        }
    }

    // Ping-pong between the two GPU buffers: the one NOT used this call still
    // holds the previous result, which the difference path below consumes.
    float* gpu_image = NULL;
    float* diff_gpu_image = NULL;
    if( _curr_gpu_image == 0 )
    {
        _curr_gpu_image = 1;
        gpu_image = _gpu_image_0;
        diff_gpu_image = _gpu_image_1;
    }
    else
    {
        _curr_gpu_image = 0;
        gpu_image = _gpu_image_1;
        diff_gpu_image = _gpu_image_0;
    }

    // Upload padded window, convolve in the frequency domain (in place).
    cudaMemcpy(gpu_image, _padded_image, sizeof(float) * _padded_pixels * 2, cudaMemcpyHostToDevice);
    cufftExecC2C(_fft_plan, (cufftComplex*)gpu_image, (cufftComplex*)gpu_image, CUFFT_FORWARD);
    multiplyComplexComplex(_gabor_data, gpu_image, gpu_image, _padded_pixels);
    cufftExecC2C(_fft_plan, (cufftComplex*)gpu_image, (cufftComplex*)gpu_image, CUFFT_INVERSE);

    if( _do_difference_images )
    {
        // Difference against the previous frame's filtered result, then
        // download the difference image instead of the raw result.
        differenceImages( diff_gpu_image, gpu_image, diff_gpu_image, _padded_pixels);
        cudaMemcpy(_padded_image, diff_gpu_image, sizeof(float) * _padded_pixels * 2, cudaMemcpyDeviceToHost);
    }
    else
    {
        cudaMemcpy(_padded_image, gpu_image, sizeof(float) * _padded_pixels * 2, cudaMemcpyDeviceToHost);
    }

    // Extract 128x128 from padded result
    // (skips the _filter_size/2 border introduced by padding).
    {
        float* data = _padded_image;
        float* target = _result_data;
        int left = _filter_size / 2;
        int bottom = _filter_size / 2;
        data += 2 * left;
        int data_stride = 2 * _padded_size;
        int target_stride = 2 * _target_size;
        data += bottom * data_stride;
        for (int i = 0; i < _target_size; ++i)
        {
            memcpy(target, data, sizeof(float) * target_stride);
            target += target_stride;
            data += data_stride;
        }
    }
}
// Fast convolution of `in` with `kernel` via cuFFT, writing to `out`:
// forward-FFT the rows of the input, multiply by the (transformed) kernel with
// the 1/columns inverse-FFT scaling folded in, then inverse-FFT.
// When D == 1 the kernel is a single row applied to every row of the input;
// otherwise it is a dense row-major rows x columns matrix.
// If transform_kernel is true the kernel is forward-FFT'd here first.
// NOTE(review): cuFFT/CUDA status codes are not checked here; also, the
// transform_kernel path uses a 1-transform plan even when D != 1 — presumably
// callers only pass transform_kernel with D == 1; confirm against call sites.
void Fastconv_base<D, T, ComplexFmt>::fconv
  (T const* in, T const* kernel, T* out,
   length_type rows, length_type columns, bool transform_kernel)
{
    size_t kernel_size = (D == 1) ? columns : rows * columns;

    // allocate device memory and copy input and kernel over from host
    Device_storage<T> dev_out(rows * columns);
    Device_storage<T> dev_kernel(kernel_size);
    Device_storage<T> dev_in(rows * columns);

    // If the kernel is a matrix, it is assumed to be row-major and dense.
    // As a result, it can be copied as one contiguous chunk.
    cudaMemcpy(
      dev_kernel.data(),
      kernel,
      kernel_size * sizeof(T),
      cudaMemcpyHostToDevice);
    ASSERT_CUDA_OK();

    // Transfer the input (row major, dense)
    cudaMemcpy(
      dev_in.data(),
      in,
      rows * columns * sizeof(T),
      cudaMemcpyHostToDevice);
    ASSERT_CUDA_OK();

    // convert pointers to types the CUFFT library accepts
    typedef cufftComplex ctype;
    ctype* d_out = reinterpret_cast<ctype*>(dev_out.data());
    ctype* d_kernel = reinterpret_cast<ctype*>(dev_kernel.data());
    ctype* d_in = reinterpret_cast<ctype*>(dev_in.data());

    cufftHandle plan;
    if (transform_kernel)
    {
      // Create a 1D FFT plan and transform the kernel (in place)
      cufftPlan1d(&plan, columns, CUFFT_C2C, 1);
      cufftExecC2C(plan, d_kernel, d_kernel, CUFFT_FORWARD);
      cufftDestroy(plan);
    }

    // Create a FFTM plan: one `columns`-point transform per row, batched
    cufftPlan1d(&plan, columns, CUFFT_C2C, rows);

    // transform the data (in place)
    cufftExecC2C(plan, d_in, d_in, CUFFT_FORWARD);

    // convolve with kernel, combine with scaling needed for inverse FFT
    // (cuFFT transforms are unnormalized, hence the 1/columns factor)
    typedef typename impl::Scalar_of<T>::type scalar_type;
    scalar_type scale = 1 / static_cast<scalar_type>(columns);
    if (D == 1)
      vmmuls_row_cc(d_kernel, d_in, d_out, scale, rows, columns);
    else
      mmmuls_cc(d_kernel, d_in, d_out, scale, rows, columns);

    // inverse transform the signal (reuses the batched plan)
    cufftExecC2C(plan, d_out, d_out, CUFFT_INVERSE);
    cufftDestroy(plan);

    // Move data back to the host from the output buffer
    cudaMemcpy(
      out,
      dev_out.data(),
      rows * columns * sizeof(T),
      cudaMemcpyDeviceToHost);
    ASSERT_CUDA_OK();
}