Exemplo n.º 1
0
/*!
	computes the N-point DFT of signal X and stores in Y using CUDA's FFT library
*/
bool cuda_dft(cuComplex *Y, cuComplex *X, float scale, int N) {
	size_t bytes = (size_t)N * sizeof(cuComplex);
	cuComplex *Y_gpu, *X_gpu;
	
	cudaMalloc((void **)&Y_gpu, bytes);
	cudaMalloc((void **)&X_gpu, bytes);

	cudaMemcpy(Y_gpu, Y, bytes, cudaMemcpyHostToDevice);
	cudaMemcpy(X_gpu, X, bytes, cudaMemcpyHostToDevice);

	cufftHandle plan;
	cufftPlan1d(&plan, N, CUFFT_C2C, 1);

	cufftExecC2C(plan, X_gpu, Y_gpu, CUFFT_FORWARD);

	cufftDestroy(plan);

	cudaMemcpy(Y, Y_gpu, bytes, cudaMemcpyDeviceToHost);

	cudaFree(Y_gpu);
	cudaFree(X_gpu);

	for (int n = 0; n < N; n++) {
		Y[n].x *= scale;
		Y[n].y *= scale;
	}

	return true;
}
Exemplo n.º 2
0
/*
 * Class:     jcuda_jcufft_JCufft
 * Method:    cufftExecC2CNative
 * Signature: (Ljcuda/jcufft/cufftHandle;Ljcuda/Pointer;Ljcuda/Pointer;I)I
 */
JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftExecC2CNative
  (JNIEnv *env, jclass cla, jobject handle, jobject cIdata, jobject cOdata, jint direction)
{
    if (handle == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftExecC2C");
        return JCUFFT_INTERNAL_ERROR;
    }
    if (cIdata == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'cIdata' is null for cufftExecC2C");
        return JCUFFT_INTERNAL_ERROR;
    }
    if (cOdata == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'cOdata' is null for cufftExecC2C");
        return JCUFFT_INTERNAL_ERROR;
    }

    Logger::log(LOG_TRACE, "Executing cufftExecC2C\n");

    cufftHandle nativePlan = env->GetIntField(handle, cufftHandle_plan);
    cufftComplex* nativeCIData = (cufftComplex*)getPointer(env, cIdata);
    cufftComplex* nativeCOData = (cufftComplex*)getPointer(env, cOdata);

    cufftResult result = cufftExecC2C(nativePlan, nativeCIData, nativeCOData, direction);
    return result;
}
Exemplo n.º 3
0
extern "C" void cuda_fft(float *d_data, int Lx, int Ny, void *stream)
{
	cufftHandle plan;
	cufftPlan1d(&plan, Lx, CUFFT_C2C, Ny);
	cufftSetStream(plan, (cudaStream_t)stream);
	cufftExecC2C(plan, (cufftComplex*)d_data, (cufftComplex*)d_data,CUFFT_FORWARD);
	cufftDestroy(plan);
}
Exemplo n.º 4
0
Arquivo: cu2.c Projeto: E-LLP/QuIP
void g_fwdfft(QSP_ARG_DECL  Data_Obj *dst_dp, Data_Obj *src1_dp)
{
	//Variable declarations
	int NX = 256;
	//int BATCH = 10;
	int BATCH = 1;
	cufftResult_t status;

	//Declare plan for FFT
	cufftHandle plan;
	//cufftComplex *data;
	//cufftComplex *result;
	void *data;
	void *result;
	cudaError_t drv_err;

	//Allocate RAM
	//cutilSafeCall(cudaMalloc(&data, sizeof(cufftComplex)*NX*BATCH));	
	//cutilSafeCall(cudaMalloc(&result, sizeof(cufftComplex)*NX*BATCH));
	drv_err = cudaMalloc(&data, sizeof(cufftComplex)*NX*BATCH);
	if( drv_err != cudaSuccess ){
		WARN("error allocating cuda data buffer for fft!?");
		return;
	}
	drv_err = cudaMalloc(&result, sizeof(cufftComplex)*NX*BATCH);
	if( drv_err != cudaSuccess ){
		WARN("error allocating cuda result buffer for fft!?");
		// BUG clean up previous malloc...
		return;
	}

	//Create plan for FFT
	status = cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
	if (status != CUFFT_SUCCESS) {
		sprintf(ERROR_STRING, "Error in cufftPlan1d: %s\n", getCUFFTError(status));
		NWARN(ERROR_STRING);
	}

	//Run forward fft on data
	status = cufftExecC2C(plan, (cufftComplex *)data,
			(cufftComplex *)result, CUFFT_FORWARD);
	if (status != CUFFT_SUCCESS) {
		sprintf(ERROR_STRING, "Error in cufftExecC2C: %s\n", getCUFFTError(status));
		NWARN(ERROR_STRING);
	}

	//Run inverse fft on data
	/*status = cufftExecC2C(plan, data, result, CUFFT_INVERSE);
	if (status != CUFFT_SUCCESS)
	{
		sprintf(ERROR_STRING, "Error in cufftExecC2C: %s\n", getCUFFTError(status));
		NWARN(ERROR_STRING);
	}*/

	//Free resources
	cufftDestroy(plan);
	cudaFree(data);
}
Exemplo n.º 5
0
/*
 * Excecute a local 1D FFT
 */
int dfft_cuda_local_fft(
    cuda_cpx_t *in,
    cuda_cpx_t *out,
    cuda_plan_t p,
    int dir)
    {
    cufftResult res;
    res = cufftExecC2C(p, in, out, dir ? CUFFT_INVERSE : CUFFT_FORWARD);
    return res;
    }
Exemplo n.º 6
0
void oskar_fft_exec(oskar_FFT* h, oskar_Mem* data, int* status)
{
    oskar_Mem *data_copy = 0, *data_ptr = data;
    if (oskar_mem_location(data) != h->location)
    {
        data_copy = oskar_mem_create_copy(data, h->location, status);
        data_ptr = data_copy;
    }
    if (h->location == OSKAR_CPU)
    {
        if (h->num_dim == 1)
        {
            *status = OSKAR_ERR_FUNCTION_NOT_AVAILABLE;
        }
        else if (h->num_dim == 2)
        {
            if (h->precision == OSKAR_DOUBLE)
                oskar_fftpack_cfft2f(h->dim_size, h->dim_size, h->dim_size,
                        oskar_mem_double(data_ptr, status),
                        oskar_mem_double(h->fftpack_wsave, status),
                        oskar_mem_double(h->fftpack_work, status));
            else
                oskar_fftpack_cfft2f_f(h->dim_size, h->dim_size, h->dim_size,
                        oskar_mem_float(data_ptr, status),
                        oskar_mem_float(h->fftpack_wsave, status),
                        oskar_mem_float(h->fftpack_work, status));
            /* This step not needed for W-kernel generation, so turn it off. */
            if (h->ensure_consistent_norm)
                oskar_mem_scale_real(data_ptr, (double)h->num_cells_total,
                        0, h->num_cells_total, status);
        }
    }
    else if (h->location == OSKAR_GPU)
    {
#ifdef OSKAR_HAVE_CUDA
        if (h->precision == OSKAR_DOUBLE)
            cufftExecZ2Z(h->cufft_plan,
                    (cufftDoubleComplex*) oskar_mem_void(data_ptr),
                    (cufftDoubleComplex*) oskar_mem_void(data_ptr),
                    CUFFT_FORWARD);
        else
            cufftExecC2C(h->cufft_plan,
                    (cufftComplex*) oskar_mem_void(data_ptr),
                    (cufftComplex*) oskar_mem_void(data_ptr),
                    CUFFT_FORWARD);
#endif
    }
    else
        *status = OSKAR_ERR_BAD_LOCATION;
    if (oskar_mem_location(data) != h->location)
        oskar_mem_copy(data, data_ptr, status);
    oskar_mem_free(data_copy, status);
}
Exemplo n.º 7
0
void fft3dGPU(T1* d_data, int nx, int ny, int nz, void* stream)
{
	cufftHandle plan;
	cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL);

	if (cufftPlan3d(&plan, nz, ny, nx, CUFFT_C2C)!=CUFFT_SUCCESS) {
		fprintf(stderr, "CUFFT error: Plan creation failed");
	}

	cufftSetStream(plan, (cudaStream_t) stream);
	cufftExecC2C(plan, (cufftComplex*) d_data, (cufftComplex*) d_data, CUFFT_FORWARD);
	cufftDestroy(plan);
}
Exemplo n.º 8
0
void
Fastconv_base<D, T, C>::fconv
  (T const* in, T const* kernel, T* out, length_type rows, length_type columns, bool transform_kernel)
{
  // convert pointers to types the CUFFT library accepts
  typedef cufftComplex ctype;
  ctype* d_out = reinterpret_cast<ctype*>(out);
  ctype* d_kernel = const_cast<ctype*>(reinterpret_cast<ctype const*>(kernel));
  ctype* d_in = const_cast<ctype*>(reinterpret_cast<ctype const*>(in));

  cufftHandle plan;
  if (transform_kernel)
  {
    // Create a 1D FFT plan and transform the kernel
    cufftPlan1d(&plan, columns, CUFFT_C2C, 1);
    cufftExecC2C(plan, d_kernel, d_kernel, CUFFT_FORWARD);
    cufftDestroy(plan);
  }

  // Create a FFTM plan
  cufftPlan1d(&plan, columns, CUFFT_C2C, rows);

  // transform the data
  cufftExecC2C(plan, d_in, d_in, CUFFT_FORWARD);

  // convolve with kernel, combine with scaling needed for inverse FFT
  typedef typename impl::scalar_of<T>::type scalar_type;
  scalar_type scale = 1 / static_cast<scalar_type>(columns);
  if (D == 1)
    vmmuls_row(kernel, in, out, scale, rows, columns);
  else
    mmmuls(kernel, in, out, scale, rows, columns);

  // inverse transform the signal
  cufftExecC2C(plan, d_out, d_out, CUFFT_INVERSE);
  cufftDestroy(plan);
}
Exemplo n.º 9
0
    static cufftResult_t centerdifft(const std::complex<float>* in_mat, std::complex<float>* out_mat,
                                     cufftHandle* fftplan)
    {
      cufftResult_t cufftResult;

      cufftResult = cufftExecC2C(*fftplan,
                   (cufftComplex*)in_mat,
                   (cufftComplex*)out_mat,
                   CUFFT_INVERSE);
      
      AGILE_ASSERT(result == CUFFT_SUCCESS,
                        StandardException::ExceptionMessage(
                          "Error during FFT procedure"));

      return cufftResult;
    }
Exemplo n.º 10
0
      void inverse(const GPUMatrixPitched<TSpectralType>& fft_x,
                   GPUMatrixPitched<TSignalType>& x)
      {
        // assert that we have no zero-padded lines
        AGILE_ASSERT(x.getNumColumns() == x.getPitchElements(),
                      StandardException::ExceptionMessage(
                        "FFT of non-aligned matrices not implemented, yet"));

        // create a plan
        createPlan(x.getNumRows(), x.getNumColumns());
        cufftResult result = cufftExecC2C(
          m_plan,
          (typename to_cufft_type<TSpectralType>::type*)fft_x.data(),
          (typename to_cufft_type<TSignalType>::type*)x.data(),
          CUFFT_INVERSE);
        AGILE_ASSERT(result == CUFFT_SUCCESS,
                        StandardException::ExceptionMessage(
                          "Error during FFT procedure"));

        scale(TSignalType(1./(fft_x.getNumRows() * fft_x.getNumColumns())), x, x);
      }
Exemplo n.º 11
0
void oskar_imager_finalise_plane(oskar_Imager* h, oskar_Mem* plane,
        double plane_norm, int* status)
{
    int size, num_cells;
    DeviceData* d;
    if (*status) return;

    /* Apply normalisation. */
    if (plane_norm > 0.0 || plane_norm < 0.0)
        oskar_mem_scale_real(plane, 1.0 / plane_norm, status);
    if (h->algorithm == OSKAR_ALGORITHM_DFT_2D ||
            h->algorithm == OSKAR_ALGORITHM_DFT_3D)
        return;

    /* Check plane is complex type, as plane must be gridded visibilities. */
    if (!oskar_mem_is_complex(plane))
    {
        *status = OSKAR_ERR_TYPE_MISMATCH;
        return;
    }

    /* Make image using FFT and apply grid correction. */
    size = h->grid_size;
    num_cells = size * size;
    d = &h->d[0];
    if (oskar_mem_precision(plane) == OSKAR_DOUBLE)
    {
        oskar_fftphase_cd(size, size, oskar_mem_double(plane, status));
        if (h->fft_on_gpu)
        {
#ifdef OSKAR_HAVE_CUDA
            oskar_device_set(h->cuda_device_ids[0], status);
            oskar_mem_copy(d->plane_gpu, plane, status);
            cufftExecZ2Z(h->cufft_plan, oskar_mem_void(d->plane_gpu),
                    oskar_mem_void(d->plane_gpu), CUFFT_FORWARD);
            oskar_mem_copy(plane, d->plane_gpu, status);
#else
            *status = OSKAR_ERR_CUDA_NOT_AVAILABLE;
#endif
        }
        else
        {
            oskar_fftpack_cfft2f(size, size, size,
                    oskar_mem_double(plane, status),
                    oskar_mem_double(h->fftpack_wsave, status),
                    oskar_mem_double(h->fftpack_work, status));
            oskar_mem_scale_real(plane, (double)num_cells, status);
        }
        oskar_fftphase_cd(size, size, oskar_mem_double(plane, status));
        oskar_grid_correction_d(size, oskar_mem_double(h->corr_func, status),
                oskar_mem_double(plane, status));
    }
    else
    {
        oskar_fftphase_cf(size, size, oskar_mem_float(plane, status));
        if (h->fft_on_gpu)
        {
#ifdef OSKAR_HAVE_CUDA
            oskar_device_set(h->cuda_device_ids[0], status);
            oskar_mem_copy(d->plane_gpu, plane, status);
            cufftExecC2C(h->cufft_plan, oskar_mem_void(d->plane_gpu),
                    oskar_mem_void(d->plane_gpu), CUFFT_FORWARD);
            oskar_mem_copy(plane, d->plane_gpu, status);
#else
            *status = OSKAR_ERR_CUDA_NOT_AVAILABLE;
#endif
        }
        else
        {
            oskar_fftpack_cfft2f_f(size, size, size,
                    oskar_mem_float(plane, status),
                    oskar_mem_float(h->fftpack_wsave, status),
                    oskar_mem_float(h->fftpack_work, status));
            oskar_mem_scale_real(plane, (double)num_cells, status);
        }
        oskar_fftphase_cf(size, size, oskar_mem_float(plane, status));
        oskar_grid_correction_f(size, oskar_mem_double(h->corr_func, status),
                oskar_mem_float(plane, status));
    }
}
Exemplo n.º 12
0
int main(int argc, char *argv[]) {
	int i;
	struct timeval begin, end;
	int size;
	size_t bytes;
	int n = 0, m = 0;
	STARPUFFT(plan) plan;
#ifdef STARPU_HAVE_FFTW
	_FFTW(plan) fftw_plan;
#endif
#ifdef STARPU_USE_CUDA
	cufftHandle cuda_plan;
	cudaError_t cures;
#endif
	double timing;

	if (argc < 2 || argc > 3) {
		fprintf(stderr,"need one or two size of vector\n");
		exit(EXIT_FAILURE);
	}

	starpu_init(NULL);

	if (argc == 2) {
		n = atoi(argv[1]);

		/* 1D */
		size = n;
	} else if (argc == 3) {
		n = atoi(argv[1]);
		m = atoi(argv[2]);

		/* 2D */
		size = n * m;
	} else {
		assert(0);
	}

	bytes = size * sizeof(STARPUFFT(complex));

	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
	starpu_srand48(0);
	for (i = 0; i < size; i++)
		in[i] = starpu_drand48() + I * starpu_drand48();

	STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out));

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw));
#endif

#ifdef STARPU_USE_CUDA
	STARPUFFT(complex) *out_cuda = malloc(size * sizeof(*out_cuda));
#endif

	if (argc == 2) {
		plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
			printf("erf\n");
#endif

	} else if (argc == 3) {
		plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
#endif
	} else {
		assert(0);
	}

#ifdef STARPU_HAVE_FFTW
	gettimeofday(&begin, NULL);
	_FFTW(execute)(fftw_plan);
	gettimeofday(&end, NULL);
	_FFTW(destroy_plan)(fftw_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif
#ifdef STARPU_USE_CUDA
	gettimeofday(&begin, NULL);
	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
		printf("erf2\n");
	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
		STARPU_CUDA_REPORT_ERROR(cures);
	gettimeofday(&end, NULL);
	cufftDestroy(cuda_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif

	STARPUFFT(execute)(plan, in, out);

	STARPUFFT(showstats)(stdout);
	STARPUFFT(destroy_plan)(plan);

	printf("\n");
#if 0
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(in[i]), creal(in[i]));
	printf("\n\n");
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out[i]), creal(out[i]));
	printf("\n\n");
#ifdef STARPU_HAVE_FFTW
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i]));
	printf("\n\n");
#endif
#endif

#ifdef STARPU_HAVE_FFTW
{
	double max = 0., tot = 0., norm = 0., normdiff = 0.;
	for (i = 0; i < size; i++) {
		double diff = cabs(out[i]-out_fftw[i]);
		double diff2 = diff * diff;
		double size = cabs(out_fftw[i]);
		double size2 = size * size;
		if (diff > max)
			max = diff;
		tot += diff;
		normdiff += diff2;
		norm += size2;
	}
	fprintf(stderr, "\nmaximum difference %g\n", max);
	fprintf(stderr, "average difference %g\n", tot / size);
	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
	double relmaxdiff = max / sqrt(norm);
	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
	double relavgdiff = (tot / size) / sqrt(norm);
	fprintf(stderr, "relative average difference %g\n", relavgdiff);
	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
		return EXIT_FAILURE;
	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
		return EXIT_FAILURE;
}
#endif

#ifdef STARPU_USE_CUDA
{
	double max = 0., tot = 0., norm = 0., normdiff = 0.;
	for (i = 0; i < size; i++) {
		double diff = cabs(out_cuda[i]-out_fftw[i]);
		double diff2 = diff * diff;
		double size = cabs(out_fftw[i]);
		double size2 = size * size;
		if (diff > max)
			max = diff;
		tot += diff;
		normdiff += diff2;
		norm += size2;
	}
	fprintf(stderr, "\nmaximum difference %g\n", max);
	fprintf(stderr, "average difference %g\n", tot / size);
	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
	double relmaxdiff = max / sqrt(norm);
	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
	double relavgdiff = (tot / size) / sqrt(norm);
	fprintf(stderr, "relative average difference %g\n", relavgdiff);
	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
		return EXIT_FAILURE;
	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
		return EXIT_FAILURE;
}
#endif

	STARPUFFT(free)(in);
	STARPUFFT(free)(out);

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(free)(out_fftw);
#endif

#ifdef STARPU_USE_CUDA
	free(out_cuda);
#endif

	starpu_shutdown();

	return EXIT_SUCCESS;
}
Exemplo n.º 13
0
bool test2(void)
{
    float
    *h_Data,
    *h_Kernel,
    *h_ResultCPU,
    *h_ResultGPU;

    float
    *d_Data,
    *d_Kernel,
    *d_PaddedData,
    *d_PaddedKernel;

    fComplex
    *d_DataSpectrum0,
    *d_KernelSpectrum0;

    cufftHandle
    fftPlan;

    bool bRetVal;
    StopWatchInterface *hTimer = NULL;
    sdkCreateTimer(&hTimer);

    printf("Testing updated custom R2C / C2R FFT-based convolution\n");
    const int kernelH = 7;
    const int kernelW = 6;
    const int kernelY = 3;
    const int kernelX = 4;
    const int dataH = 2000;
    const int dataW = 2000;
    const int fftH = snapTransformSize(dataH + kernelH - 1);
    const int fftW = snapTransformSize(dataW + kernelW - 1);

    printf("...allocating memory\n");
    h_Data      = (float *)malloc(dataH   * dataW * sizeof(float));
    h_Kernel    = (float *)malloc(kernelH * kernelW * sizeof(float));
    h_ResultCPU = (float *)malloc(dataH   * dataW * sizeof(float));
    h_ResultGPU = (float *)malloc(fftH    * fftW * sizeof(float));

    checkCudaErrors(cudaMalloc((void **)&d_Data,   dataH   * dataW   * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float)));

    checkCudaErrors(cudaMalloc((void **)&d_PaddedData,   fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float)));

    checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum0,   fftH * (fftW / 2) * sizeof(fComplex)));
    checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum0, fftH * (fftW / 2) * sizeof(fComplex)));

    printf("...generating random input data\n");
    srand(2010);

    for (int i = 0; i < dataH * dataW; i++)
    {
        h_Data[i] = getRand();
    }

    for (int i = 0; i < kernelH * kernelW; i++)
    {
        h_Kernel[i] = getRand();
    }

    printf("...creating C2C FFT plan for %i x %i\n", fftH, fftW / 2);
    checkCudaErrors(cufftPlan2d(&fftPlan, fftH, fftW / 2, CUFFT_C2C));

    printf("...uploading to GPU and padding convolution kernel and input data\n");
    checkCudaErrors(cudaMemcpy(d_Data,   h_Data,   dataH   * dataW *   sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemset(d_PaddedData,   0, fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float)));

    padDataClampToBorder(
        d_PaddedData,
        d_Data,
        fftH,
        fftW,
        dataH,
        dataW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    padKernel(
        d_PaddedKernel,
        d_Kernel,
        fftH,
        fftW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    //CUFFT_INVERSE works just as well...
    const int FFT_DIR = CUFFT_FORWARD;

    //Not including kernel transformation into time measurement,
    //since convolution kernel is not changed very frequently
    printf("...transforming convolution kernel\n");
    checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum0, FFT_DIR));

    printf("...running GPU FFT convolution: ");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedData, (cufftComplex *)d_DataSpectrum0, FFT_DIR));
    spProcess2D(d_DataSpectrum0, d_DataSpectrum0, d_KernelSpectrum0, fftH, fftW / 2, FFT_DIR);
    checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_DataSpectrum0, (cufftComplex *)d_PaddedData, -FFT_DIR));

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    double gpuTime = sdkGetTimerValue(&hTimer);
    printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime);

    printf("...reading back GPU FFT results\n");
    checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost));

    printf("...running reference CPU convolution\n");
    convolutionClampToBorderCPU(
        h_ResultCPU,
        h_Data,
        h_Kernel,
        dataH,
        dataW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    printf("...comparing the results: ");
    double sum_delta2 = 0;
    double sum_ref2   = 0;
    double max_delta_ref = 0;

    for (int y = 0; y < dataH; y++)
    {
        for (int x = 0; x < dataW; x++)
        {
            double  rCPU = (double)h_ResultCPU[y * dataW + x];
            double  rGPU = (double)h_ResultGPU[y * fftW  + x];
            double delta = (rCPU - rGPU) * (rCPU - rGPU);
            double   ref = rCPU * rCPU + rCPU * rCPU;

            if ((delta / ref) > max_delta_ref)
            {
                max_delta_ref = delta / ref;
            }

            sum_delta2 += delta;
            sum_ref2   += ref;
        }
    }

    double L2norm = sqrt(sum_delta2 / sum_ref2);
    printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref));
    bRetVal = (L2norm < 1e-6) ? true : false;
    printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n");

    printf("...shutting down\n");
    sdkStartTimer(&hTimer);
    checkCudaErrors(cufftDestroy(fftPlan));

    checkCudaErrors(cudaFree(d_KernelSpectrum0));
    checkCudaErrors(cudaFree(d_DataSpectrum0));
    checkCudaErrors(cudaFree(d_PaddedKernel));
    checkCudaErrors(cudaFree(d_PaddedData));
    checkCudaErrors(cudaFree(d_Kernel));
    checkCudaErrors(cudaFree(d_Data));

    free(h_ResultGPU);
    free(h_ResultCPU);
    free(h_Kernel);
    free(h_Data);

    return bRetVal;
}
Exemplo n.º 14
0
/**
 * Execute single precision C2C plan. This function is blocking and only returns after the transform is completed.
 * @note For inplace transforms, data_out should point to the same memory address as data, AND
 * the plan must have been created as inplace.
 * @param plan FFT plan created by \ref accfft_plan_dft_3d_r2cf.
 * @param data Input data in frequency domain.
 * @param data_out Output data in frequency domain.
 * @param timer See \ref timer for more details.
 * @param XYZ a bit set field that determines which directions FFT should be executed
 */
void accfft_execute_c2c_gpuf(accfft_plan_gpuf* plan, int direction,Complexf * data_d, Complexf * data_out_d, double * timer,std::bitset<3> xyz){

  if(!plan->c2c_plan_baked){
    if(plan->procid==0) std::cout<<"Error. r2c plan has not been made correctly. Please first create the plan before calling execute functions."<<std::endl;
    return;
  }

  if(data_d==NULL)
    data_d=plan->data_c;
  if(data_out_d==NULL)
    data_out_d=plan->data_out_c;
  int * coords=plan->coord;
  int procid=plan->procid;
  double fft_time=0;
  double timings[5]={0};

  int NY=plan->N[1];
  cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
  cudaEvent_t fft_startEvent, fft_stopEvent;
  checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
  checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
  cufftResult_t cufft_error;
  float dummy_time=0;

  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  if(direction==-1){
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/
    // FFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft(cufftExecC2C(plan->fplan_0,(cufftComplex*)data_d, (cufftComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;

    if(!plan->oneD){
      plan->T_plan_1->execute_gpu(plan->T_plan_1,(float*)data_out_d,timings,2,osize_0[0],coords[0]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1[0];++i){
        checkCuda_accfft(cufftExecC2C(plan->fplan_1,(cufftComplex*)&data_out_d[i*osize_1[1]*osize_1[2]], (cufftComplex*)&data_out_d[i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }

    MPI_Barrier(plan->c_comm);



    if(plan->oneD){
      plan->T_plan_2->execute_gpu(plan->T_plan_2,(float*)data_out_d,timings,2);
    }
    else{
      plan->T_plan_2->execute_gpu(plan->T_plan_2,(float*)data_out_d,timings,2,1,coords[1]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0 x N1/P0 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft(cufftExecC2C(plan->fplan_2,(cufftComplex*)data_out_d, (cufftComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }

  }
  else if (direction==1){
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecC2C(plan->fplan_2,(cufftComplex*)data_d, (cufftComplex*)data_d,CUFFT_INVERSE));
      checkCuda_accfft (cudaDeviceSynchronize());
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }


    if(plan->oneD){
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(float*)data_d,timings,1);
    }
    else{
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(float*)data_d,timings,1,1,coords[1]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1i[0];++i){
        checkCuda_accfft (cufftExecC2C(plan->fplan_1,(cufftComplex*)&data_d[i*NY*osize_1i[2]], (cufftComplex*)&data_d[i*NY*osize_1i[2]],CUFFT_INVERSE));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    MPI_Barrier(plan->c_comm);

    if(!plan->oneD){
      plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(float*)data_d,timings,1,osize_1i[0],coords[0]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/

    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecC2C(plan->fplan_0,(cufftComplex*)data_d,(cufftComplex*)data_out_d,CUFFT_INVERSE));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;

  }

  timings[4]+=fft_time;
  if(timer==NULL){
    //delete [] timings;
  }
  else{
    timer[0]+=timings[0];
    timer[1]+=timings[1];
    timer[2]+=timings[2];
    timer[3]+=timings[3];
    timer[4]+=timings[4];
  }
  MPI_Barrier(plan->c_comm);

  return;
}
Exemplo n.º 15
0
void accfft_execute_gpuf(accfft_plan_gpuf* plan, int direction,float * data_d, float * data_out_d, double * timer,std::bitset<3> xyz){

  if(data_d==NULL)
    data_d=plan->data;
  if(data_out_d==NULL)
    data_out_d=plan->data_out;

  int * coords=plan->coord;
  int procid=plan->procid;
  double fft_time=0;
  double timings[5]={0};

  cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
  cudaEvent_t fft_startEvent, fft_stopEvent;
  checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
  checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
  int NY=plan->N[1];
  float dummy_time=0;


  int *osize_0 =plan->osize_0;// *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1;// *ostart_1 =plan->ostart_1;
  //int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i;//*ostart_1i=plan->ostart_1i;
  //int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  if(direction==-1){
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/
    // FFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecR2C(plan->fplan_0,(cufftReal*)data_d, (cufftComplex*)data_out_d));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;

    // Perform N0/P0 transpose


    if(!plan->oneD){
      plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2,osize_0[0],coords[0]);
    }
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1[0];++i){
        checkCuda_accfft (cufftExecC2C(plan->fplan_1,(cufftComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]], (cufftComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }

    if(plan->oneD){
      plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2);
    }
    else{
      plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2,1,coords[1]);
    }
    /**************************************************************/
    /*******************  N0 x N1/P0 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecC2C(plan->fplan_2,(cufftComplex*)data_out_d, (cufftComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
  }
  else if (direction==1){
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecC2C(plan->fplan_2,(cufftComplex*)data_d, (cufftComplex*)data_d,CUFFT_INVERSE));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }

    if(plan->oneD){
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(float*)data_d,timings,1);
    }
    else{
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(float*)data_d,timings,1,1,coords[1]);
    }
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1i[0];++i){
        checkCuda_accfft (cufftExecC2C(plan->fplan_1,(cufftComplex*)&data_d[2*i*NY*osize_1i[2]], (cufftComplex*)&data_d[2*i*NY*osize_1i[2]],CUFFT_INVERSE));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }



    if(!plan->oneD){
      plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(float*)data_d,timings,1,osize_1i[0],coords[0]);
    }
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/

    // IFFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecC2R(plan->iplan_0,(cufftComplex*)data_d,(cufftReal*)data_out_d));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;

  }

  timings[4]+=fft_time;
  if(timer==NULL){
    //delete [] timings;
  }
  else{
    timer[0]+=timings[0];
    timer[1]+=timings[1];
    timer[2]+=timings[2];
    timer[3]+=timings[3];
    timer[4]+=timings[4];
  }
  MPI_Barrier(plan->c_comm);

  return;
}
Exemplo n.º 16
0
void WorkerThread::createNewFilter()
{
	// Free GPU memory from current filter and CUFFT
	cudaFree(_gabor_data);
	cudaFree(_gpu_image_0);
	cudaFree(_gpu_image_1);
	cufftDestroy(_fft_plan);

	float* gaussian_data;
	cudaMalloc((void**)&gaussian_data, sizeof(float) * _filter_pixels);
	int2 gaussian_size;
	gaussian_size.x = _filter_size;
	gaussian_size.y = _filter_size;
	int2 gaussian_center;
	gaussian_center.x = _filter_size / 2;
	gaussian_center.y = _filter_size / 2;
	gaussian(gaussian_data, _new_theta, _new_sigma, 1.0, gaussian_center, gaussian_size);
	
	float* harmonic_data;
	cudaMalloc((void**)&harmonic_data, sizeof(float) * _filter_pixels * 2);
	int2 harmonic_size;
	harmonic_size.x = _filter_size;
	harmonic_size.y = _filter_size;
	int2 harmonic_center;
	harmonic_center.x = _filter_size / 2;
	harmonic_center.y = _filter_size / 2;
	harmonic(harmonic_data, _new_theta, _new_lambda, _new_psi, harmonic_center, harmonic_size);
	float* host_harmonic = new float[_filter_size * _filter_size * 2];
	
	cudaMalloc((void**)&_gabor_data, sizeof(float) * _filter_pixels * 2);
	int2 gabor_size;
	gabor_size.x = _filter_size;
	gabor_size.y = _filter_size;
	int2 gabor_center;
	gabor_center.x = _filter_size / 2;
	gabor_center.y = _filter_size / 2;
	multiplyRealComplex(gaussian_data, harmonic_data, _gabor_data, _filter_size * _filter_size);
	float* host_gabor_data = new float[_filter_pixels * 2];
	cudaMemcpy(host_gabor_data,
		_gabor_data,
		sizeof(float) * _filter_pixels * 2,
		cudaMemcpyDeviceToHost);

	//pad the filter
	{
		float* data = host_gabor_data;
		float* target = _filter_image;
		memset(target, 0, sizeof(float) * _padded_pixels * 2);
		int padded_stride = 2 * _padded_size;
		int target_stride = 2 * _target_size;
		for (int i = 0; i < _target_size; ++i)
		{
			memcpy(target, data, sizeof(float) * target_stride);
			target += padded_stride;
			data += target_stride;
		}
	}

	// Copy gabor data into member for texture creation
	_filter_image_mutex.lock();
	memcpy(_host_gabor_data, host_gabor_data, sizeof(float) * _filter_pixels * 2);
	_filter_image_mutex.unlock();
	
	cudaFree(_gabor_data);
	
	cudaMalloc((void**)&_gabor_data, sizeof(float) * _padded_pixels * 2);
	cudaMalloc((void**)&_gpu_image_0, sizeof(float) * _padded_pixels * 2);
	cudaMalloc((void**)&_gpu_image_1, sizeof(float) * _padded_pixels * 2);
	cudaMemcpy(_gabor_data,
		_filter_image,
		sizeof(float) * _padded_pixels * 2,
		cudaMemcpyHostToDevice);

	cufftPlan2d(&_fft_plan, _padded_size, _padded_size, CUFFT_C2C);
	cufftExecC2C(_fft_plan,
		(cufftComplex*)(_gabor_data),
		(cufftComplex*)(_gabor_data),
		CUFFT_FORWARD);
	cudaMemcpy(_filter_image,
		_gabor_data,
		sizeof(float) * _padded_pixels * 2,
		cudaMemcpyDeviceToHost);

	// Free temporary GPU memory used for creation of filter
	cudaFree(gaussian_data);
	cudaFree(harmonic_data);

	delete host_harmonic;
	delete host_gabor_data;

	_should_create_new_filter = false;

	emit newFilterImage();
}
Exemplo n.º 17
0
int main2(int sockfd)
{
        cufftHandle plan;
        cufftComplex *devPtr;
        cufftReal indata[NX*BATCH];
        cufftComplex data[NX*BATCH];
        int i,timer,j,k;
        char fname[15];
        FILE *f;
	#define BUFSIZE (21*4096*sizeof(int))
	int buffer[BUFSIZE];

        int p,nread;


	f = fopen("21-4096","rb");
	nread=fread(buffer,BUFSIZE,1,f);
	printf("nread=%i\n",nread);
	fclose(f);

        i=0;
        for (j=0;j<BATCH;j++) {
            for (k=0;k<NX;k++) {
                data[j*NX+k].x = buffer[j*NX+k];
                data[j*NX+k].y = 0;
            }
	}


        //f=fopen("y.txt","r");
    /* source data creation */

        //int sockfd = myconnect();
        //printf("connected\n");
	
		

        /* WORKING!!!!!!!!
        i=0;
        for (j=0;j<BATCH;j++) {
            sprintf(fname,"%i.txt",j);
            printf("%s\n",fname);
            f = fopen(fname,"r");
            for (k=0;k<NX;k++) {
                fscanf(f,"%i\n",&p);
                data[j*NX+k].x = p;
                data[j*NX+k].y = 0;
            }
            fclose(f);
	*/
/*
        for(i=  0 ; i < NX*BATCH ; i++){
                //fscanf(f,"%i\n",&p);
                //data[i].x= p;
                data[i].x= 1.0f;
                //printf("%f\n",data[i].x);
                data[i].y = 0.0f;
        }
        //fclose(f)
        */
        //}


        /* creates 1D FFT plan */
        cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);


        /*
        cutCreateTimer(&timer);
        cutResetTimer(timer);
        cutStartTimer(timer);
        */
        
    /* GPU memory allocation */
        cudaMalloc((void**)&devPtr, sizeof(cufftComplex)*NX*BATCH);

    /* transfer to GPU memory */
        cudaMemcpy(devPtr, data, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyHostToDevice);


        /* executes FFT processes */
        cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD);

        /* executes FFT processes (inverse transformation) */
       //cufftExecC2C(plan, devPtr, devPtr, CUFFT_INVERSE);

    /* transfer results from GPU memory */
        cudaMemcpy(data, devPtr, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyDeviceToHost);

        /* deletes CUFFT plan */
        cufftDestroy(plan);

    /* frees GPU memory */
        cudaFree(devPtr);

        /*
        cudaThreadSynchronize();
        cutStopTimer(timer);
        printf("%f\n",cutGetTimerValue(timer)/(float)1000);
        cutDeleteTimer(timer);
        */

        /*
        float mag;
        for(i = 0 ; i < NX*BATCH ; i++){
                //printf("data[%d] %f %f\n", i, data[i].x, data[i].y);
                //printf("%f\n", data[i].x);
                mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX;
                printf("%f\n",mag);

        }
        */

/*
        // save as text file
        float mag;
        i=0;
        for (j=0;j<BATCH;j++) {
            sprintf(fname,"%i-mag.txt",j);
            printf("%s\n",fname);
            f = fopen(fname,"w");
            for (k=0;k<NX;k++) {
                //fscanf(f,"%i\n",&p);
                if (k>50)
                    continue;
                i = j*NX+k;
                mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX;
                fprintf(f,"%f\n",mag);
            }
            fclose(f);
        }
*/


        float mag;
        i=0;
        float mags[NX];
        int magsint[NX*BATCH];
        memset(magsint,0,sizeof(int)*NX*BATCH);
        int u = 0;

        printf("%f %f %f %f\n",data[0].x,data[1].x,data[2].x,data[3].x);

        //printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]);

//        f = fopen("ffts.bin","wb");
        for (j=0;j<BATCH;j++) {
//            sprintf(fname,"%i-bin.dat",j);
//            printf("%s\n",fname);

            for (k=0;k<NX;k++) {
                //fscanf(f,"%i\n",&p);
                if (k>50)
                    continue;
                i = j*NX+k;
                mags[k] = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX;
                magsint[u]=mags[k]    ;
                u++;
                //fprintf(f,"%f\n",mag);
                
            }

            //f = fopen(fname,"wb");
  //          fwrite(magsint,sizeof(int)*50,1,f);
        }
        int n;
        n = write(sockfd,magsint,sizeof(int)*BATCH*50);
        printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]);
        printf("send ok, size: %i\n",n);
        //fclose(f);
        
        
        return 0;
}
Exemplo n.º 18
0
void WorkerThread::createInitialFilter()
{
	float* gaussian_data;
	cudaMalloc((void**)&gaussian_data, sizeof(float) * _filter_pixels);
	int2 gaussian_size;
	gaussian_size.x = _filter_size;
	gaussian_size.y = _filter_size;
	int2 gaussian_center;
	gaussian_center.x = _filter_size / 2;
	gaussian_center.y = _filter_size / 2;
	gaussian(gaussian_data, 0.0, _sigma, 1.0, gaussian_center, gaussian_size);
	
	float* harmonic_data;
	cudaMalloc((void**)&harmonic_data, sizeof(float) * _filter_pixels * 2);
	int2 harmonic_size;
	harmonic_size.x = _filter_size;
	harmonic_size.y = _filter_size;
	int2 harmonic_center;
	harmonic_center.x = _filter_size / 2;
	harmonic_center.y = _filter_size / 2;
	harmonic(harmonic_data, 0, _lambda, 0.0, harmonic_center, harmonic_size);
	float* host_harmonic = new float[_filter_size * _filter_size * 2];
	
	cudaMalloc((void**)&_gabor_data, sizeof(float) * _filter_pixels * 2);
	int2 gabor_size;
	gabor_size.x = _filter_size;
	gabor_size.y = _filter_size;
	int2 gabor_center;
	gabor_center.x = _filter_size / 2;
	gabor_center.y = _filter_size / 2;
	multiplyRealComplex(gaussian_data, harmonic_data, _gabor_data, _filter_size * _filter_size);
	float* host_gabor_data = new float[_filter_pixels * 2];
	cudaMemcpy(host_gabor_data,
		_gabor_data,
		sizeof(float) * _filter_pixels * 2,
		cudaMemcpyDeviceToHost);

	//pad the filter
	{
		float* data = host_gabor_data;
		float* target = _filter_image;
		memset(target, 0, sizeof(float) * _padded_pixels * 2);
		int padded_stride = 2 * _padded_size;
		int target_stride = 2 * _target_size;
		for (int i = 0; i < _target_size; ++i)
		{
			memcpy(target, data, sizeof(float) * target_stride);
			target += padded_stride;
			data += target_stride;
		}
	}

	// Copy gabor data into member for texture creation
	_filter_image_mutex.lock();
	memcpy(_host_gabor_data, host_gabor_data, sizeof(float) * _filter_pixels * 2);
	_filter_image_mutex.unlock();
	
	cudaFree(_gabor_data);
	
	cudaMalloc((void**)&_gabor_data, sizeof(float) * _padded_pixels * 2);
	cudaMalloc((void**)&_gpu_image_0, sizeof(float) * _padded_pixels * 2);
	cudaMalloc((void**)&_gpu_image_1, sizeof(float) * _padded_pixels * 2);
	cudaMemcpy(_gabor_data,
		_filter_image,
		sizeof(float) * _padded_pixels * 2,
		cudaMemcpyHostToDevice);

	cufftPlan2d(&_fft_plan, _padded_size, _padded_size, CUFFT_C2C);
	cufftExecC2C(_fft_plan,
		(cufftComplex*)(_gabor_data),
		(cufftComplex*)(_gabor_data),
		CUFFT_FORWARD);
	cudaMemcpy(_filter_image,
		_gabor_data,
		sizeof(float) * _padded_pixels * 2,
		cudaMemcpyDeviceToHost);

	emit newFilterImage();
}
Exemplo n.º 19
0
cufftResult WINAPI wine_cufftExecC2C(cufftHandle plan, cufftComplex *idata, cufftComplex *odata, int direction){
	WINE_TRACE("\n");
	return cufftExecC2C( plan, idata, odata, direction );
}
Exemplo n.º 20
0
void WorkerThread::gaborFilter(float *data)
{
	//get the subimage and pad it
	{
		float* target = _padded_image;
		memset(target, 0, sizeof(float) * _padded_pixels * 2);
		int left = _target_x - _target_size / 2;
		int bottom = _target_y - _target_size / 2;
		data += 2 * left;
		int original_stride = 2 * _original_image_width;
		int padded_stride = 2 * _padded_size;
		int target_stride = 2 * _target_size;
		data += bottom * original_stride;
		for (int i = 0; i < _target_size; ++i)
		{
			memcpy(target, data, sizeof(float) * target_stride);
			target += padded_stride;
			data += original_stride;
		}
	}
	
	float* gpu_image = NULL;
	float* diff_gpu_image = NULL;
	if( _curr_gpu_image == 0 )
	{
		_curr_gpu_image = 1;
		gpu_image = _gpu_image_0;	
		diff_gpu_image = _gpu_image_1;	
	}
	else
	{
		_curr_gpu_image = 0;
		gpu_image = _gpu_image_1;	
		diff_gpu_image = _gpu_image_0;	
	}
	
	cudaMemcpy(gpu_image,
		_padded_image,
		sizeof(float) * _padded_pixels * 2,
		cudaMemcpyHostToDevice);
	
	cufftExecC2C(_fft_plan,
		(cufftComplex*)gpu_image,
		(cufftComplex*)gpu_image,
		CUFFT_FORWARD);
	
	multiplyComplexComplex(_gabor_data, gpu_image, gpu_image, _padded_pixels);
	
	cufftExecC2C(_fft_plan,
		(cufftComplex*)gpu_image,
		(cufftComplex*)gpu_image,
		CUFFT_INVERSE);

	if( _do_difference_images )
	{
		differenceImages( diff_gpu_image, gpu_image, diff_gpu_image, _padded_pixels);
		cudaMemcpy(_padded_image,
			diff_gpu_image,
			sizeof(float) * _padded_pixels * 2,
			cudaMemcpyDeviceToHost);
	}
	else
	{
		cudaMemcpy(_padded_image,
			gpu_image,
			sizeof(float) * _padded_pixels * 2,
			cudaMemcpyDeviceToHost);
	}


	// Extract 128x128 from padded result
	{
		float* data = _padded_image;
		float* target = _result_data;
		int left = _filter_size / 2;
		int bottom = _filter_size / 2;
		data += 2 * left;
		int data_stride = 2 * _padded_size;
		int target_stride = 2 * _target_size;
		data += bottom * data_stride;
		for (int i = 0; i < _target_size; ++i)
		{
			memcpy(target, data, sizeof(float) * target_stride);
			target += target_stride;
			data += data_stride;
		}
	}
}
Exemplo n.º 21
0
void
Fastconv_base<D, T, ComplexFmt>::fconv
  (T const* in, T const* kernel, T* out, length_type rows, length_type columns, bool transform_kernel)
{
  size_t kernel_size = (D == 1) ? columns : rows * columns;

  // allocate device memory and copy input and kernel over from host
  Device_storage<T> dev_out(rows * columns);
  Device_storage<T> dev_kernel(kernel_size);
  Device_storage<T> dev_in(rows * columns);

  // If the kernel is a matrix, it is assumed to be row-major and dense.
  // As a result, it can be copied as one contiguous chunk.
  cudaMemcpy(
    dev_kernel.data(),
    kernel,
    kernel_size * sizeof(T), 
    cudaMemcpyHostToDevice);
  ASSERT_CUDA_OK();

  // Transfer the input (row major, dense)
  cudaMemcpy(
    dev_in.data(),
    in,
    rows * columns * sizeof(T), 
    cudaMemcpyHostToDevice);
  ASSERT_CUDA_OK();
 

  // convert pointers to types the CUFFT library accepts
  typedef cufftComplex ctype;
  ctype* d_out = reinterpret_cast<ctype*>(dev_out.data());
  ctype* d_kernel = reinterpret_cast<ctype*>(dev_kernel.data());
  ctype* d_in = reinterpret_cast<ctype*>(dev_in.data());

  cufftHandle plan;
  if (transform_kernel)
  {
    // Create a 1D FFT plan and transform the kernel
    cufftPlan1d(&plan, columns, CUFFT_C2C, 1);
    cufftExecC2C(plan, d_kernel, d_kernel, CUFFT_FORWARD);
    cufftDestroy(plan);
  }

  // Create a FFTM plan
  cufftPlan1d(&plan, columns, CUFFT_C2C, rows);

  // transform the data
  cufftExecC2C(plan, d_in, d_in, CUFFT_FORWARD);

  // convolve with kernel, combine with scaling needed for inverse FFT
  typedef typename impl::Scalar_of<T>::type scalar_type;
  scalar_type scale = 1 / static_cast<scalar_type>(columns);
  if (D == 1)
    vmmuls_row_cc(d_kernel, d_in, d_out, scale, rows, columns);
  else
    mmmuls_cc(d_kernel, d_in, d_out, scale, rows, columns);

  // inverse transform the signal
  cufftExecC2C(plan, d_out, d_out, CUFFT_INVERSE);
  cufftDestroy(plan);

  // Move data back to the host from the output buffer
  cudaMemcpy(
    out,
    dev_out.data(),
    rows * columns * sizeof(T), 
    cudaMemcpyDeviceToHost);
  ASSERT_CUDA_OK();
}