Пример #1
0
void plan_fft(FFT_plans *plans, Arrays *arr,
	      Detector_settings *sett, Command_line_opts *opts)
{
  /*
   * Device buffer allocation and cuFFT plan creation.
   *
   *  - arr->arr_len is set to the larger of sett->Ninterp and
   *    sett->fftpad * sett->nfft, and eight work arrays of that length are
   *    allocated on the device (four cufftDoubleComplex, four COMPLEX_TYPE).
   *  - sett->nfftf is set to sett->fftpad * sett->nfft.
   *  - plans->plan, plans->pl_int and plans->pl_inv are created.
   *
   * The return codes of cufftPlan1d() are not inspected here.
   */
  int i;
  void **dbl_bufs[4];
  void **flt_bufs[4];

  // Work-array length: the larger of Ninterp and fftpad*nfft
  arr->arr_len = (sett->Ninterp >= sett->fftpad * sett->nfft ? sett->Ninterp : sett->fftpad * sett->nfft);

  // Double-precision work arrays, allocated in the order xa, xb, xar, xbr
  dbl_bufs[0] = (void**)&arr->cu_xa;
  dbl_bufs[1] = (void**)&arr->cu_xb;
  dbl_bufs[2] = (void**)&arr->cu_xar;
  dbl_bufs[3] = (void**)&arr->cu_xbr;
  for (i = 0; i < 4; ++i) {
    CudaSafeCall ( cudaMalloc(dbl_bufs[i], arr->arr_len*sizeof(cufftDoubleComplex)) );
  }

  // COMPLEX_TYPE work arrays, same order
  flt_bufs[0] = (void**)&arr->cu_xa_f;
  flt_bufs[1] = (void**)&arr->cu_xb_f;
  flt_bufs[2] = (void**)&arr->cu_xar_f;
  flt_bufs[3] = (void**)&arr->cu_xbr_f;
  for (i = 0; i < 4; ++i) {
    CudaSafeCall ( cudaMalloc(flt_bufs[i], arr->arr_len*sizeof(COMPLEX_TYPE)) );
  }

  // Length of the zero-padded transform
  sett->nfftf = sett->fftpad*sett->nfft;

  if (opts->fftinterp == INT) {
    // Interbinning: two extra nfft-long buffers and a plan of length nfft
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xa2_f, sett->nfft*sizeof(COMPLEX_TYPE)) );
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xb2_f, sett->nfft*sizeof(COMPLEX_TYPE)) );
    cufftPlan1d( &(plans->plan), sett->nfft, CUFFT_TRANSFORM_TYPE, 1);
  } else {
    // FFT with zero padding: plan of length nfftf
    cufftPlan1d( &(plans->plan), sett->nfftf, CUFFT_TRANSFORM_TYPE, 1);
  }

  // Plans used for the spline interpolation: length nfft and length Ninterp
  cufftPlan1d(&(plans->pl_int), sett->nfft, CUFFT_Z2Z, 1);
  cufftPlan1d(&(plans->pl_inv), sett->Ninterp, CUFFT_Z2Z, 1);
}
Пример #2
0
/*!
	Computes the N-point forward DFT of the host signal X with cuFFT, multiplies
	every output sample by `scale`, and stores the result in the host array Y.

	\param Y     host output array with room for N samples
	\param X     host input array of N samples
	\param scale factor applied to the real and imaginary part of every output
	\param N     transform length; must be positive
	\return true on success; false if an argument is invalid or any CUDA/cuFFT
	        call fails. On failure Y is left untouched.
*/
bool cuda_dft(cuComplex *Y, cuComplex *X, float scale, int N) {
	if (!Y || !X || N <= 0)
		return false;

	const size_t bytes = (size_t)N * sizeof(cuComplex);
	cuComplex *Y_gpu = 0, *X_gpu = 0;
	cufftHandle plan;
	bool have_plan = false;
	bool ok = false;

	// Single-pass block so that every failure falls through to one cleanup path.
	do {
		if (cudaMalloc((void **)&Y_gpu, bytes) != cudaSuccess)
			break;
		if (cudaMalloc((void **)&X_gpu, bytes) != cudaSuccess)
			break;

		// Only the input is uploaded: the out-of-place transform overwrites
		// every element of Y_gpu, so copying Y to the device is wasted work.
		if (cudaMemcpy(X_gpu, X, bytes, cudaMemcpyHostToDevice) != cudaSuccess)
			break;

		if (cufftPlan1d(&plan, N, CUFFT_C2C, 1) != CUFFT_SUCCESS)
			break;
		have_plan = true;

		if (cufftExecC2C(plan, X_gpu, Y_gpu, CUFFT_FORWARD) != CUFFT_SUCCESS)
			break;

		// The blocking copy also waits for the transform to finish.
		if (cudaMemcpy(Y, Y_gpu, bytes, cudaMemcpyDeviceToHost) != cudaSuccess)
			break;

		ok = true;
	} while (0);

	if (have_plan)
		cufftDestroy(plan);
	cudaFree(Y_gpu);	// cudaFree on a null pointer is a no-op
	cudaFree(X_gpu);

	if (ok) {
		for (int n = 0; n < N; n++) {
			Y[n].x *= scale;
			Y[n].y *= scale;
		}
	}

	return ok;
}
Пример #3
0
extern "C" void cuda_fft(double *d_data, int Lx, int Ny, void *stream) 
{ 
	cufftHandle plan; 
	cufftPlan1d(&plan, Lx, CUFFT_Z2Z, Ny); 
	cufftSetStream(plan, (cudaStream_t)stream); 
	cufftExecZ2Z(plan, (cufftDoubleComplex*)d_data, (cufftDoubleComplex*)d_data,CUFFT_FORWARD); 
	cufftDestroy(plan); 
} 
Пример #4
0
extern "C" void cuda_fft(float *d_data, int Lx, int Ny, void *stream)
{
	cufftHandle plan;
	cufftPlan1d(&plan, Lx, CUFFT_C2C, Ny);
	cufftSetStream(plan, (cudaStream_t)stream);
	cufftExecC2C(plan, (cufftComplex*)d_data, (cufftComplex*)d_data,CUFFT_FORWARD);
	cufftDestroy(plan);
}
Пример #5
0
Файл: cu2.c Проект: E-LLP/QuIP
/*
 * Runs one forward complex-to-complex cuFFT of NX points (BATCH transforms)
 * from a freshly allocated device input buffer into a freshly allocated device
 * output buffer, reporting allocation and cuFFT failures via WARN/NWARN.
 *
 * NOTE(review): dst_dp and src1_dp are not referenced; the transform operates
 * on uninitialised scratch buffers, so this looks like a stub - confirm.
 *
 * Both device buffers and the plan are released on every exit path.
 */
void g_fwdfft(QSP_ARG_DECL  Data_Obj *dst_dp, Data_Obj *src1_dp)
{
	//Transform size and batch count
	int NX = 256;
	int BATCH = 1;
	cufftResult_t status;

	//Plan handle and device buffers
	cufftHandle plan;
	void *data;
	void *result;
	cudaError_t drv_err;

	//Allocate device memory for input and output
	drv_err = cudaMalloc(&data, sizeof(cufftComplex)*NX*BATCH);
	if( drv_err != cudaSuccess ){
		WARN("error allocating cuda data buffer for fft!?");
		return;
	}
	drv_err = cudaMalloc(&result, sizeof(cufftComplex)*NX*BATCH);
	if( drv_err != cudaSuccess ){
		WARN("error allocating cuda result buffer for fft!?");
		// release the input buffer that was already allocated
		cudaFree(data);
		return;
	}

	//Create plan for FFT
	status = cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
	if (status != CUFFT_SUCCESS) {
		sprintf(ERROR_STRING, "Error in cufftPlan1d: %s\n", getCUFFTError(status));
		NWARN(ERROR_STRING);
		// no valid plan: do not execute or destroy it, just release the buffers
		cudaFree(result);
		cudaFree(data);
		return;
	}

	//Run forward fft on data
	status = cufftExecC2C(plan, (cufftComplex *)data,
			(cufftComplex *)result, CUFFT_FORWARD);
	if (status != CUFFT_SUCCESS) {
		sprintf(ERROR_STRING, "Error in cufftExecC2C: %s\n", getCUFFTError(status));
		NWARN(ERROR_STRING);
	}

	//An inverse transform (CUFFT_INVERSE) with the same plan is not run here.

	//Free resources (the output buffer used to be leaked)
	cufftDestroy(plan);
	cudaFree(result);
	cudaFree(data);
}
Пример #6
0
/*
 * Function to be called in thread managing host operations and invoking kernels
 */
void* host_thread(void* passing_ptr) {
    DataArray* data_arr_ptr = (DataArray*) passing_ptr;

    alloc_data_host(data_arr_ptr);
    printf("data allocated by host thread\n");

    //printf("data filling by host thread\n");
    for (uint64_t ii = 0; ii < data_arr_ptr->size; ii++) {
        (*(data_arr_ptr->data_r))[ii] = ii;
        (*(data_arr_ptr->data_k))[ii] = data_arr_ptr->size-ii;
    }
    printf("data filled by host thread\n");

    // synchronize after allocating memory - streams should be created, mem on device ready for copying
    pthread_barrier_wait (&barrier);
    printf("1st barier host thread - allocating mem on cpu\n");





    //  here we can make cufft plan, for example
    cufftHandle plan_forward;
    cufftPlan1d(&plan_forward, N, CUFFT_Z2Z, 1);



    // synchornize after ... - data should be copyied on device
    pthread_barrier_wait (&barrier);
    printf("2nd barier host thread - \n");


    // run some computations
    cufftExecZ2Z(plan_forward, *(data_arr_ptr->data_r_dev), *(data_arr_ptr->data_k_dev), CUFFT_FORWARD);
    printf("cufft done\n");

    // synchornize after computations -

    cudaDeviceSynchronize(); // should be used on
    pthread_barrier_wait (&barrier);
    printf("3rd barier host thread - \n");



    // synchornize after computations -
    pthread_barrier_wait (&barrier);
    printf("4th barier host thread - \n");

    printf("data visible in host thread:\n");
    /*for (uint64_t ii = 0; ii < (data_arr_ptr->size <= 32) ? data_arr_ptr->size : 32 ; ii++) {
      printf("%lu.\t",ii);
      printf("%lf + %lfj\t", creal( (*(data_arr_ptr->data_r))[ii] ), cimag( (*(data_arr_ptr->data_r))[ii] ));
      printf("%lf + %lfj\n", creal( (*(data_arr_ptr->data_k))[ii] ), cimag( (*(data_arr_ptr->data_k))[ii] ));
    }*/

    printf("closing host thread\n");
    pthread_exit(NULL);
}
Пример #7
0
/*
 * Creates a batched 1-D cuFFT plan.
 *
 *   plan   receives the new plan handle
 *   type   0 = complex-to-complex, 1 = real-to-complex, 2 = complex-to-real;
 *          any other value falls back to complex-to-complex
 *   n      transform length (converted to int for cuFFT)
 *   batch  number of transforms (converted to int for cuFFT)
 *
 * The outcome is passed on to check_error().
 */
void cuda_make_cufft_plan(cufftHandle *plan, int type, size_t n, size_t batch)
{
  cufftType cufft_type;
  switch (type) {
    case 1:  cufft_type = CUFFT_R2C; break;
    case 2:  cufft_type = CUFFT_C2R; break;
    default: cufft_type = CUFFT_C2C; break;
  }

  /* cufftPlan1d() returns a cufftResult, which is a different enumeration
     from cudaError_t; only the success value (0) coincides. */
  cufftResult result = cufftPlan1d(plan, (int)n, cufft_type, (int)batch);

  /* NOTE(review): check_error() is assumed to take a cudaError_t - confirm.
     A cuFFT failure is reported as cudaErrorUnknown so it is not printed as
     an unrelated CUDA error that merely shares the numeric value. */
  check_error(result == CUFFT_SUCCESS ? cudaSuccess : cudaErrorUnknown);
}
Пример #8
0
void
Fastconv_base<D, T, C>::fconv
  (T const* in, T const* kernel, T* out, length_type rows, length_type columns, bool transform_kernel)
{
  // Fast convolution of `rows` signals of `columns` samples each:
  //   1. optionally forward-transform the kernel in place,
  //   2. forward-transform the input in place (batched over rows),
  //   3. multiply by the kernel and by 1/columns,
  //   4. inverse-transform the product in place in `out`.
  // cuFFT return codes are not inspected.

  // cuFFT takes non-const cufftComplex pointers, hence the casts.
  typedef cufftComplex cplx;
  cplx* signal_in  = const_cast<cplx*>(reinterpret_cast<cplx const*>(in));
  cplx* kernel_buf = const_cast<cplx*>(reinterpret_cast<cplx const*>(kernel));
  cplx* signal_out = reinterpret_cast<cplx*>(out);

  cufftHandle fft;

  // Step 1: single transform of length `columns` over the kernel.
  if (transform_kernel)
  {
    cufftPlan1d(&fft, columns, CUFFT_C2C, 1);
    cufftExecC2C(fft, kernel_buf, kernel_buf, CUFFT_FORWARD);
    cufftDestroy(fft);
  }

  // Step 2: one plan covering all rows, reused for the inverse pass below.
  cufftPlan1d(&fft, columns, CUFFT_C2C, rows);
  cufftExecC2C(fft, signal_in, signal_in, CUFFT_FORWARD);

  // Step 3: the scale folds in the normalisation needed for the inverse FFT.
  typedef typename impl::scalar_of<T>::type scalar_type;
  scalar_type scale = static_cast<scalar_type>(1) / static_cast<scalar_type>(columns);
  if (D != 1)
    mmmuls(kernel, in, out, scale, rows, columns);
  else
    vmmuls_row(kernel, in, out, scale, rows, columns);

  // Step 4: back to the signal domain.
  cufftExecC2C(fft, signal_out, signal_out, CUFFT_INVERSE);
  cufftDestroy(fft);
}
Пример #9
0
/*
 * Class:     jcuda_jcufft_JCufft
 * Method:    cufftPlan1dNative
 * Signature: (Ljcuda/jcufft/JCufftHandle;III)I
 *
 * Creates a 1-D cuFFT plan and stores the resulting native handle in the
 * `plan` field of the given Java handle object. Returns the cufftResult of
 * cufftPlan1d, or JCUFFT_INTERNAL_ERROR (with a pending
 * NullPointerException) when `handle` is null.
 */
JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftPlan1dNative
  (JNIEnv *env, jclass cla, jobject handle, jint nx, jint type, jint batch)
{
    if (handle != NULL)
    {
        Logger::log(LOG_TRACE, "Creating 1D plan for %d elements of type %d\n", nx, type);

        // Start from the value currently stored in the Java object, let cuFFT
        // overwrite it, then write the new handle back.
        cufftHandle nativePlan = env->GetIntField(handle, cufftHandle_plan);
        cufftResult status = cufftPlan1d(&nativePlan, nx, getCufftType(type), batch);
        env->SetIntField(handle, cufftHandle_plan, nativePlan);
        return status;
    }

    ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftPlan1d");
    return JCUFFT_INTERNAL_ERROR;
}
Пример #10
0
/*
 * MEX gateway around cufftPlan1d.
 *
 * Inputs  (exactly four): plan handle, transform length nx, cuFFT transform
 *                         type, batch count - each read as a scalar.
 * Outputs: plhs[0] = cufftResult status as a double scalar,
 *          plhs[1] = resulting plan handle (only when a second output is
 *                    requested).
 */
void
mexFunction( int nlhs, mxArray *plhs[],
             int nrhs, const  mxArray *prhs[] )
{
  if (nrhs != 4)
    mexErrMsgTxt("Wrong number of arguments");

  /* Unpack the scalar arguments in positional order. */
  cufftHandle handle    = (cufftHandle) mxGetScalar(prhs[0]);
  int         length    = (int) mxGetScalar(prhs[1]);
  cufftType_t transform = (cufftType_t) ((int) mxGetScalar(prhs[2]));
  int         count     = (int) mxGetScalar(prhs[3]);

  cufftResult rc = cufftPlan1d(&handle, length, transform, count);

  /* The status is always returned; the handle only on request. */
  plhs[0] = mxCreateDoubleScalar(rc);
  if (nlhs >= 2)
    plhs[1] = mxCreateDoubleScalar(handle);
}
Пример #11
0
/*
 * Creates an FFT handle for transforms of `num_dim` dimensions, each of
 * extent `dim_size`, in the given precision and at the given location.
 *
 * The handle is always returned, even when *status has been set to an error
 * code. Problems are reported through *status:
 *   - OSKAR_ERR_FUNCTION_NOT_AVAILABLE for 1-D transforms on the CPU and for
 *     OpenCL locations (when built with OpenCL),
 *   - OSKAR_ERR_INVALID_ARGUMENT for num_dim other than 1 or 2,
 *   - OSKAR_ERR_BAD_LOCATION for an unrecognised location.
 *
 * NOTE(review): the calloc() result and the cufftPlan1d()/cufftPlan2d()
 * return codes are not checked here.
 */
oskar_FFT* oskar_fft_create(int precision, int location, int num_dim,
        int dim_size, int batch_size_1d, int* status)
{
    int i;
    /* Zero-initialised handle. */
    oskar_FFT* h = (oskar_FFT*) calloc(1, sizeof(oskar_FFT));
    /* Fall back to the CPU when the requested back end was not compiled in. */
#ifndef OSKAR_HAVE_CUDA
    if (location == OSKAR_GPU) location = OSKAR_CPU;
#endif
#ifndef OSKAR_HAVE_OPENCL
    if (location & OSKAR_CL) location = OSKAR_CPU;
#endif
    h->precision = precision;
    h->location = location;
    h->num_dim = num_dim;
    h->dim_size = dim_size;
    h->ensure_consistent_norm = 1;
    /* Total number of cells: dim_size raised to the power num_dim. */
    h->num_cells_total = (size_t) dim_size;
    for (i = 1; i < num_dim; ++i) h->num_cells_total *= (size_t) dim_size;
    if (location == OSKAR_CPU)
    {
        /* Length of the FFTPACK wsave array:
         * 4*dim_size + 2*(truncated log2 of dim_size) + 8. */
        int len = 4 * dim_size +
                2 * (int)(log((double)dim_size) / log(2.0)) + 8;
        h->fftpack_wsave = oskar_mem_create(precision, location, len, status);
        if (num_dim == 1)
        {
            /* 1-D transforms are not available on the CPU path. */
            (void) batch_size_1d;
            *status = OSKAR_ERR_FUNCTION_NOT_AVAILABLE;
        }
        else if (num_dim == 2)
        {
            /* Initialise the 2-D FFTPACK workspace in the right precision. */
            if (precision == OSKAR_DOUBLE)
                oskar_fftpack_cfft2i(dim_size, dim_size,
                        oskar_mem_double(h->fftpack_wsave, status));
            else
                oskar_fftpack_cfft2i_f(dim_size, dim_size,
                        oskar_mem_float(h->fftpack_wsave, status));
        }
        else
            *status = OSKAR_ERR_INVALID_ARGUMENT;
        /* Work array of 2 * num_cells_total elements (created in every case). */
        h->fftpack_work = oskar_mem_create(precision, location,
                2 * h->num_cells_total, status);
    }
    else if (location == OSKAR_GPU)
    {
#ifdef OSKAR_HAVE_CUDA
        /* Z2Z plans for double precision, C2C otherwise;
         * batch_size_1d applies to the 1-D plan only. */
        if (num_dim == 1)
            cufftPlan1d(&h->cufft_plan, dim_size,
                    ((precision == OSKAR_DOUBLE) ? CUFFT_Z2Z : CUFFT_C2C),
                    batch_size_1d);
        else if (num_dim == 2)
            cufftPlan2d(&h->cufft_plan, dim_size, dim_size,
                    ((precision == OSKAR_DOUBLE) ? CUFFT_Z2Z : CUFFT_C2C));
        else
            *status = OSKAR_ERR_INVALID_ARGUMENT;
#endif
    }
    else if (location & OSKAR_CL)
    {
#ifdef OSKAR_HAVE_OPENCL
        /* No OpenCL implementation is provided. */
        *status = OSKAR_ERR_FUNCTION_NOT_AVAILABLE;
#endif
    }
    else
        *status = OSKAR_ERR_BAD_LOCATION;
    return h;
}
 /**
    Constructor taking a plain length.
    Builds a batched 1D complex-to-real (CUFFT_C2R) plan and stores the
    handle in the member `plan`; the cuFFT result goes through CUFFT_CHECK.
    @param size0 requested size of CUFFT plan (converted to int for cuFFT)
    @param batch number of 1D transforms
 */
 inline Plan(size_t size0, int batch = 1)
 {
   const int nx = static_cast<int>(size0);
   CUFFT_CHECK(cufftPlan1d(&plan, nx, CUFFT_C2R, batch));
 }
 /**
    Constructor taking a one-dimensional Size object.
    Builds a batched 1D complex-to-real (CUFFT_C2R) plan of length size[0]
    and stores the handle in the member `plan`; the cuFFT result goes
    through CUFFT_CHECK.
    @param size requested size of CUFFT plan (only element 0 is used)
    @param batch number of 1D transforms
 */
 inline Plan(const Size<1> &size, int batch = 1)
 {
   const int nx = static_cast<int>(size[0]);
   CUFFT_CHECK(cufftPlan1d(&plan, nx, CUFFT_C2R, batch));
 }
Пример #14
0
void
Fastconv_base<D, T, ComplexFmt>::fconv
  (T const* in, T const* kernel, T* out, length_type rows, length_type columns, bool transform_kernel)
{
  // Host-to-host fast convolution: stages the input and kernel in device
  // memory, convolves via cuFFT, and copies the result back into `out`.
  // The kernel holds `columns` elements when D == 1, rows * columns otherwise.
  // cuFFT return codes are not inspected.
  size_t kernel_size = (D == 1) ? columns : rows * columns;
  size_t const kernel_bytes = kernel_size * sizeof(T);
  size_t const signal_bytes = rows * columns * sizeof(T);

  // Device staging buffers (output, kernel, input).
  Device_storage<T> dev_out(rows * columns);
  Device_storage<T> dev_kernel(kernel_size);
  Device_storage<T> dev_in(rows * columns);

  // A matrix kernel is taken to be dense and row-major, so a single
  // contiguous copy is enough.
  cudaMemcpy(dev_kernel.data(), kernel, kernel_bytes, cudaMemcpyHostToDevice);
  ASSERT_CUDA_OK();

  // The input is likewise copied as one dense row-major chunk.
  cudaMemcpy(dev_in.data(), in, signal_bytes, cudaMemcpyHostToDevice);
  ASSERT_CUDA_OK();

  // View the device buffers through the complex type cuFFT expects.
  typedef cufftComplex cplx;
  cplx* spectrum_in  = reinterpret_cast<cplx*>(dev_in.data());
  cplx* kernel_buf   = reinterpret_cast<cplx*>(dev_kernel.data());
  cplx* spectrum_out = reinterpret_cast<cplx*>(dev_out.data());

  cufftHandle fft;

  // Optional single transform of length `columns` over the kernel, in place.
  if (transform_kernel)
  {
    cufftPlan1d(&fft, columns, CUFFT_C2C, 1);
    cufftExecC2C(fft, kernel_buf, kernel_buf, CUFFT_FORWARD);
    cufftDestroy(fft);
  }

  // Batched plan over all rows, used for both the forward and inverse pass.
  cufftPlan1d(&fft, columns, CUFFT_C2C, rows);
  cufftExecC2C(fft, spectrum_in, spectrum_in, CUFFT_FORWARD);

  // Multiply by the kernel; the 1/columns factor is the inverse-FFT scaling.
  typedef typename impl::Scalar_of<T>::type scalar_type;
  scalar_type scale = static_cast<scalar_type>(1) / static_cast<scalar_type>(columns);
  if (D != 1)
    mmmuls_cc(kernel_buf, spectrum_in, spectrum_out, scale, rows, columns);
  else
    vmmuls_row_cc(kernel_buf, spectrum_in, spectrum_out, scale, rows, columns);

  // Back to the signal domain, in place in the output buffer.
  cufftExecC2C(fft, spectrum_out, spectrum_out, CUFFT_INVERSE);
  cufftDestroy(fft);

  // Return the result to the caller's host buffer.
  cudaMemcpy(out, dev_out.data(), signal_bytes, cudaMemcpyDeviceToHost);
  ASSERT_CUDA_OK();
}
Пример #15
0
/*
 * Forward-transforms BATCH signals of NX complex samples each, in place, on
 * the GPU. host_data is both the input and the output (NX*BATCH elements).
 * Returns 0 on success; on any CUDA/cuFFT failure a diagnostic is printed to
 * stderr and a non-zero value is returned. Device resources are always
 * released.
 */
static int main2_forward_fft(cufftComplex *host_data)
{
	const size_t bytes = sizeof(cufftComplex)*NX*BATCH;
	cufftComplex *devPtr = NULL;
	cufftHandle plan;
	cufftResult fft_status;
	cudaError_t cuda_status;
	int rc = 1;

	/* creates 1D FFT plan */
	fft_status = cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
	if (fft_status != CUFFT_SUCCESS) {
		fprintf(stderr, "main2: cufftPlan1d failed (cufftResult %d)\n", (int)fft_status);
		return rc;
	}

	/* GPU memory allocation and transfer to GPU memory */
	cuda_status = cudaMalloc((void**)&devPtr, bytes);
	if (cuda_status == cudaSuccess)
		cuda_status = cudaMemcpy(devPtr, host_data, bytes, cudaMemcpyHostToDevice);

	if (cuda_status != cudaSuccess) {
		fprintf(stderr, "main2: device upload failed: %s\n", cudaGetErrorString(cuda_status));
	} else {
		/* executes the forward FFT in place */
		fft_status = cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD);
		if (fft_status != CUFFT_SUCCESS) {
			fprintf(stderr, "main2: cufftExecC2C failed (cufftResult %d)\n", (int)fft_status);
		} else {
			/* transfer results from GPU memory (blocks until the FFT is done) */
			cuda_status = cudaMemcpy(host_data, devPtr, bytes, cudaMemcpyDeviceToHost);
			if (cuda_status != cudaSuccess)
				fprintf(stderr, "main2: device download failed: %s\n", cudaGetErrorString(cuda_status));
			else
				rc = 0;
		}
	}

	/* deletes CUFFT plan and frees GPU memory (cudaFree(NULL) is a no-op) */
	cufftDestroy(plan);
	cudaFree(devPtr);
	return rc;
}

/*
 * Reads 21*4096 ints from the file "21-4096", runs a batched forward FFT over
 * them (BATCH signals of NX samples, imaginary parts zero), converts the
 * low-frequency bins to integer magnitudes and writes them to sockfd.
 *
 *   sockfd  descriptor the magnitudes are written to
 *   returns 0 on success, 1 if the file cannot be read, the GPU work fails or
 *           the write fails
 */
int main2(int sockfd)
{
	/* size in BYTES of the input file; left defined as a macro as before */
	#define BUFSIZE (21*4096*sizeof(int))
	/* BUFSIZE is a byte count, so the element count is BUFSIZE/sizeof(int) */
	int buffer[BUFSIZE/sizeof(int)];
	const size_t buffer_len = BUFSIZE/sizeof(int);
	cufftComplex data[NX*BATCH];
	int magsint[NX*BATCH];
	int j, k, u, n, nread;
	FILE *f;

	f = fopen("21-4096","rb");
	if (f == NULL) {
		perror("21-4096");
		return 1;
	}
	nread=fread(buffer,BUFSIZE,1,f);
	printf("nread=%i\n",nread);
	fclose(f);
	if (nread != 1) {
		fprintf(stderr, "main2: could not read %lu bytes from 21-4096\n", (unsigned long)BUFSIZE);
		return 1;
	}

	/* real input samples, zero imaginary part; samples beyond the end of the
	   file buffer are treated as zero instead of reading out of bounds */
	for (j=0;j<BATCH;j++) {
		for (k=0;k<NX;k++) {
			size_t idx = (size_t)j*NX+k;
			data[idx].x = (idx < buffer_len) ? buffer[idx] : 0;
			data[idx].y = 0;
		}
	}

	if (main2_forward_fft(data) != 0)
		return 1;

	memset(magsint,0,sizeof(int)*NX*BATCH);
	printf("%f %f %f %f\n",data[0].x,data[1].x,data[2].x,data[3].x);

	/* Pack the scaled magnitudes of bins 0..50 of every signal back to back.
	   NOTE(review): this stores 51 values per signal, while the write below
	   sends BATCH*50 ints - one of the two is probably off by one; confirm
	   against the receiver before changing the wire format. */
	u = 0;
	for (j=0;j<BATCH;j++) {
		for (k=0;k<NX && k<=50;k++) {
			int i = j*NX+k;
			float mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX;
			magsint[u] = mag;	/* truncated to int */
			u++;
		}
	}

	n = write(sockfd,magsint,sizeof(int)*BATCH*50);
	printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]);
	if (n < 0) {
		perror("write");
		return 1;
	}
	printf("send ok, size: %i\n",n);

	return 0;
}
Пример #16
0
/*
 * FFT self-test: transforms a random complex vector (1-D for one size
 * argument, 2-D for two) with STARPUFFT and, when available, with FFTW and
 * cuFFT, prints timings, and compares the results against the FFTW output.
 *
 * Usage: <prog> n [m]
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when a relative difference exceeds
 * the precision-dependent threshold.
 */
int main(int argc, char *argv[]) {
	int i;
	struct timeval begin, end;
	int size;
	size_t bytes;
	int n = 0, m = 0;
	STARPUFFT(plan) plan;
#ifdef STARPU_HAVE_FFTW
	_FFTW(plan) fftw_plan;
#endif
#ifdef STARPU_USE_CUDA
	cufftHandle cuda_plan;
	cudaError_t cures;
#endif
	double timing;

	if (argc < 2 || argc > 3) {
		fprintf(stderr,"need one or two size of vector\n");
		exit(EXIT_FAILURE);
	}

	starpu_init(NULL);

	if (argc == 2) {
		n = atoi(argv[1]);

		/* 1D */
		size = n;
	} else if (argc == 3) {
		n = atoi(argv[1]);
		m = atoi(argv[2]);

		/* 2D */
		size = n * m;
	} else {
		assert(0);
	}

	bytes = size * sizeof(STARPUFFT(complex));

	/* Random input shared by all implementations */
	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
	starpu_srand48(0);
	for (i = 0; i < size; i++)
		in[i] = starpu_drand48() + I * starpu_drand48();

	STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out));

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw));
#endif

#ifdef STARPU_USE_CUDA
	STARPUFFT(complex) *out_cuda = malloc(size * sizeof(*out_cuda));
#endif

	/* Create one plan per available implementation */
	if (argc == 2) {
		plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
			printf("erf\n");
#endif

	} else if (argc == 3) {
		plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
#endif
	} else {
		assert(0);
	}

#ifdef STARPU_HAVE_FFTW
	/* Time the FFTW reference transform */
	gettimeofday(&begin, NULL);
	_FFTW(execute)(fftw_plan);
	gettimeofday(&end, NULL);
	_FFTW(destroy_plan)(fftw_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif
#ifdef STARPU_USE_CUDA
	/* Time the plain cuFFT transform.
	 * NOTE(review): `in` and `out_cuda` are host allocations passed straight
	 * to cufftExecC2C - confirm this is valid for the allocator in use. */
	gettimeofday(&begin, NULL);
	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
		printf("erf2\n");
	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
		STARPU_CUDA_REPORT_ERROR(cures);
	gettimeofday(&end, NULL);
	cufftDestroy(cuda_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif

	STARPUFFT(execute)(plan, in, out);

	STARPUFFT(showstats)(stdout);
	STARPUFFT(destroy_plan)(plan);

	printf("\n");
#if 0
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(in[i]), creal(in[i]));
	printf("\n\n");
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out[i]), creal(out[i]));
	printf("\n\n");
#ifdef STARPU_HAVE_FFTW
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i]));
	printf("\n\n");
#endif
#endif

#ifdef STARPU_HAVE_FFTW
{
	/* STARPUFFT result versus the FFTW reference */
	double max = 0., tot = 0., norm = 0., normdiff = 0.;
	for (i = 0; i < size; i++) {
		double diff = cabs(out[i]-out_fftw[i]);
		double diff2 = diff * diff;
		double size = cabs(out_fftw[i]);
		double size2 = size * size;
		if (diff > max)
			max = diff;
		tot += diff;
		normdiff += diff2;
		norm += size2;
	}
	fprintf(stderr, "\nmaximum difference %g\n", max);
	fprintf(stderr, "average difference %g\n", tot / size);
	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
	double relmaxdiff = max / sqrt(norm);
	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
	double relavgdiff = (tot / size) / sqrt(norm);
	fprintf(stderr, "relative average difference %g\n", relavgdiff);
	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
		return EXIT_FAILURE;
	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
		return EXIT_FAILURE;
}
#endif

	/* The cuFFT comparison uses out_fftw as its reference, which only exists
	 * when FFTW is available; guarding on both keeps a CUDA-without-FFTW
	 * build compiling. */
#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_FFTW)
{
	/* cuFFT result versus the FFTW reference */
	double max = 0., tot = 0., norm = 0., normdiff = 0.;
	for (i = 0; i < size; i++) {
		double diff = cabs(out_cuda[i]-out_fftw[i]);
		double diff2 = diff * diff;
		double size = cabs(out_fftw[i]);
		double size2 = size * size;
		if (diff > max)
			max = diff;
		tot += diff;
		normdiff += diff2;
		norm += size2;
	}
	fprintf(stderr, "\nmaximum difference %g\n", max);
	fprintf(stderr, "average difference %g\n", tot / size);
	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
	double relmaxdiff = max / sqrt(norm);
	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
	double relavgdiff = (tot / size) / sqrt(norm);
	fprintf(stderr, "relative average difference %g\n", relavgdiff);
	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
		return EXIT_FAILURE;
	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
		return EXIT_FAILURE;
}
#endif

	STARPUFFT(free)(in);
	STARPUFFT(free)(out);

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(free)(out_fftw);
#endif

#ifdef STARPU_USE_CUDA
	free(out_cuda);
#endif

	starpu_shutdown();

	return EXIT_SUCCESS;
}
Пример #17
0
int main(void)
{
  //std::cout << "Generating a time series on device "<< tim.get_nsamps() << std::endl;
  //DeviceTimeSeries<float> d_tim(8388608);
  //d_tim.set_tsamp(0.000064);
  TimeSeries<float> tim;
  tim.from_file("/lustre/home/ebarr/Soft/peasoup/tmp5.tim");
  DeviceTimeSeries<float> d_tim(tim);
  
  unsigned int size = d_tim.get_nsamps();
  
  TimeSeriesFolder folder(size);
  
  //DeviceTimeSeries<float> d_tim_r(fft_size); //<----for resampled data
  //TimeDomainResampler resampler;
  

  float* folded_buffer;
  cudaError_t error;
  cufftResult result;
  error = cudaMalloc((void**)&folded_buffer, sizeof(float)*size);
  ErrorChecker::check_cuda_error(error);


  unsigned nints = 64;
  unsigned nbins = 32;

  cufftComplex* fft_out;
  error = cudaMalloc((void**)&fft_out, sizeof(cufftComplex)*nints*nbins);
  cufftHandle plan;
  result = cufftPlan1d(&plan,nbins,CUFFT_R2C, nints);
  ErrorChecker::check_cufft_error(result);
  Stopwatch timer;

  FoldedSubints<float> folded_array(nbins,nints);
  //folder.fold(d_tim,folded_array,0.007453079228);

  std::cout << "made it here" << std::endl;
  

  FoldOptimiser optimiser(nbins,nints);
  timer.start();
  for (int ii=0;ii<1;ii++){
    //FoldedSubints<float> folded_array(nbins,nints);
    folder.fold(d_tim,folded_array,0.007453099228);
    Utils::dump_device_buffer<float>(folded_array.get_data(),nints*nbins,"original_fold.bin");
    
    optimiser.optimise(folded_array);
  }
  timer.stop();
  
  /*
  float* temp = new float [nints*nbins];
  
  cudaMemcpy(temp,folded_buffer,nints*nbins*sizeof(float),cudaMemcpyDeviceToHost);
  ErrorChecker::check_cuda_error();

  for (int ii=0;ii<nints*nbins;ii++)
    std::cout << temp[ii] << std::endl;
  */

  
  std::cout << "Total execution time (s): " << timer.getTime()<<std::endl;
  std::cout << "Average execution time (s): " << timer.getTime()/1000.0 << std::endl;



  return 0;
}
Пример #18
0
/* Wine thunk for cufftPlan1d: emits a trace line, then forwards all arguments
   unchanged to the native library and returns its result. */
cufftResult WINAPI wine_cufftPlan1d( cufftHandle *plan, int nx, cufftType type, int batch ){
	cufftResult status;

	WINE_TRACE("\n");
	status = cufftPlan1d( plan, nx, type, batch );
	return status;
}
Пример #19
0
/*
 * Generates nr_lines lines of power-law noise of linelength samples each.
 *
 * For every line: random noise and a transfer sequence depending on PLindex
 * are built on the host, both are forward-transformed (D2Z) on the GPU,
 * combined in the frequency domain on the host, transformed back (Z2D) on
 * the GPU, normalised by sqrt(linelength) and mean-subtracted.
 *
 *   PLindex     power-law index used to build the transfer sequence
 *   nr_lines    number of lines to generate
 *   linelength  number of samples per line
 *   y_buffer    output buffer; its local buffer's `mem` receives the data
 *   fielddim_z  not used by this function
 *
 * NOTE(review): the final copy sends the combined spectrum
 * (transfer_out_host), as the code did before, and not the mean-subtracted
 * line in y_host, and it always targets the start of the buffer. Confirm
 * which payload and offset are intended.
 */
void generate_PL_lines_fft(double PLindex, int nr_lines,int linelength, 
						   icl_buffer *y_buffer, //double *y, // out
						   int fielddim_z) 
{
	cufftHandle fftw_plan_transfer_fw, fftw_plan_noise_fw, fftw_plan_transfer_bw;

	// FFT preparation: a real transform of linelength samples yields
	// linelength/2+1 complex bins
	const int    spectrum_len = linelength/2+1;
	const size_t out_size	 = sizeof(cufftDoubleComplex)*(linelength/2+1);
	const size_t in_size	 = sizeof(double)*(linelength+1);

	// host memory (could be hoisted out of the function if performance matters)
	double *noise_in_host	 = (double*) malloc(in_size);
	double *transfer_in_host = (double*) malloc(in_size);
	cufftDoubleComplex *noise_out_host    = (cufftDoubleComplex*) fftw_malloc(out_size);
	cufftDoubleComplex *transfer_out_host = (cufftDoubleComplex*) fftw_malloc(out_size);
	// allocated once instead of once per line (it used to leak every iteration)
	double *y_host = (double*)malloc(sizeof(cl_double) * linelength * nr_lines + 1);

	if (!noise_in_host || !transfer_in_host || !noise_out_host || !transfer_out_host || !y_host) {
		free(noise_in_host);
		free(transfer_in_host);
		fftw_free(noise_out_host);
		fftw_free(transfer_out_host);
		free(y_host);
		return;
	}

	// device memory
	double *noise_in_device;
	double *transfer_in_device;
	cufftDoubleComplex *noise_out_device;
	cufftDoubleComplex *transfer_out_device;
	checkCudaErrors(cudaMalloc((void **)&noise_in_device, in_size));
	checkCudaErrors(cudaMalloc((void **)&noise_out_device, out_size));
	checkCudaErrors(cudaMalloc((void **)&transfer_in_device, in_size));
	checkCudaErrors(cudaMalloc((void **)&transfer_out_device, out_size));

	// plans
	cufftPlan1d(&fftw_plan_transfer_fw,	linelength, CUFFT_D2Z, 1);
	cufftPlan1d(&fftw_plan_noise_fw,	linelength, CUFFT_D2Z, 1);
	cufftPlan1d(&fftw_plan_transfer_bw,	linelength, CUFFT_Z2D, 1);

	// XXX todo: cufftPlanMany could be used to batch the per-line transforms

	for(int v=0; v<nr_lines; v++){

		for(int i=0; i<linelength; i++){
			noise_in_host[i] = rand_01();
			transfer_in_host[i] = rand_01();
		}

		Box_Mueller(linelength, noise_in_host, transfer_in_host);

		for(int i=0; i<linelength; i++){
			noise_in_host[i] = 2*noise_in_host[i]-1; // around 0
			noise_in_host[i] = 5*noise_in_host[i];  // changes the deviation; suitable values still to be investigated
		}

		transfer_in_host[0]=1.0;
		for(int i=1; i<linelength; i++){
			transfer_in_host[i] = (transfer_in_host[i-1]/(i))*(i-1-(PLindex/2.0));
		}

		/// (a) moving transfer_in and noise_in to the device
		/// (these copies used to be labelled DeviceToHost, i.e. the wrong way round)
		checkCudaErrors(cudaMemcpy(noise_in_device, noise_in_host, in_size, cudaMemcpyHostToDevice));
		checkCudaErrors(cudaMemcpy(transfer_in_device, transfer_in_host, in_size, cudaMemcpyHostToDevice));

		// forward transforms, each with its own plan
		cufftExecD2Z(fftw_plan_transfer_fw, transfer_in_device, transfer_out_device);
		cufftExecD2Z(fftw_plan_noise_fw, noise_in_device, noise_out_device);

		/// (b) moving back transfer_out and noise_out
		checkCudaErrors(cudaMemcpy(transfer_out_host, transfer_out_device, out_size, cudaMemcpyDeviceToHost));
		checkCudaErrors(cudaMemcpy(noise_out_host, noise_out_device, out_size, cudaMemcpyDeviceToHost));

		// Combine the two spectra. The bound is the integer bin count: the old
		// floating-point bound 0.5*linelength+1 ran one element past the end
		// of both arrays for odd linelength.
		for(int i=0; i<spectrum_len; i++){
			double temp = (transfer_out_host[i].x*noise_out_host[i].x+transfer_out_host[i].y*noise_out_host[i].y) / linelength;
			transfer_out_host[i].y = (transfer_out_host[i].x*noise_out_host[i].y-transfer_out_host[i].y*noise_out_host[i].x) / linelength;
			transfer_out_host[i].x = temp;
		}

		/// (c) moving transfer_out to the device
		checkCudaErrors(cudaMemcpy(transfer_out_device, transfer_out_host, out_size, cudaMemcpyHostToDevice));

		// inverse transform
		cufftExecZ2D(fftw_plan_transfer_bw, transfer_out_device, transfer_in_device);

		/// (d) moving back transfer_in
		checkCudaErrors(cudaMemcpy(transfer_in_host, transfer_in_device, in_size, cudaMemcpyDeviceToHost));

		for(int i=0; i<linelength; i++){
			transfer_in_host[i] = transfer_in_host[i]/sqrt((double) linelength);
		}

		// xxx reduce max/avg
		double average=0;
		for(int i=0; i<linelength; i++){
			average = average + transfer_in_host[i];
		}
		average = average/linelength;

		// mean-subtracted line
		size_t index = 0;
		for(int i=0; i<linelength; i++){
			y_host[index]=(transfer_in_host[i]-average);
			index++;
		}

		/// (e) write y_buf
		/// The source is host memory; the kind of memory behind lbuf->mem is
		/// not visible here, so the runtime infers the direction.
		icl_local_buffer* lbuf = (icl_local_buffer*)(y_buffer->buffer_add);
		checkCudaErrors(cudaMemcpy((void*)lbuf->mem, transfer_out_host, out_size, cudaMemcpyDefault));
	}

	// host memory: fftw_malloc'ed blocks must be released with fftw_free
	free(y_host);
	free(transfer_in_host);
	fftw_free(transfer_out_host);
	fftw_free(noise_out_host);
	free(noise_in_host);

	cudaFree(transfer_in_device);
	cudaFree(transfer_out_device);
	cudaFree(noise_out_device);
	cudaFree(noise_in_device);

	cufftDestroy(fftw_plan_transfer_fw);
	cufftDestroy(fftw_plan_transfer_bw);
	cufftDestroy(fftw_plan_noise_fw);
}