Esempio n. 1
0
/*
 * Class:     jcuda_jcufft_JCufft
 * Method:    cufftExecD2ZNative
 * Signature: (Ljcuda/jcufft/cufftHandle;Ljcuda/Pointer;Ljcuda/Pointer;)I
 */
JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftExecD2ZNative
  (JNIEnv *env, jclass cla, jobject handle, jobject rIdata, jobject cOdata)
{
    if (handle == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftExecD2Z");
        return JCUFFT_INTERNAL_ERROR;
    }
    if (rIdata == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'rIdata' is null for cufftExecD2Z");
        return JCUFFT_INTERNAL_ERROR;
    }
    if (cOdata == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'cOdata' is null for cufftExecD2Z");
        return JCUFFT_INTERNAL_ERROR;
    }

    Logger::log(LOG_TRACE, "Executing cufftExecD2Z\n");

    cufftHandle nativePlan = env->GetIntField(handle, cufftHandle_plan);
    double* nativeRIData = (double*)getPointer(env, rIdata);
    cufftDoubleComplex* nativeCOData = (cufftDoubleComplex*)getPointer(env, cOdata);

    cufftResult result = cufftExecD2Z(nativePlan, nativeRIData, nativeCOData);
    return result;
}
Esempio n. 2
0
void accfft_execute_gpu(accfft_plan_gpu* plan, int direction,double * data_d, double * data_out_d, double * timer,std::bitset<3> xyz){

  if(data_d==NULL)
    data_d=plan->data;
  if(data_out_d==NULL)
    data_out_d=plan->data_out;

  int * coords=plan->coord;
  int procid=plan->procid;
  double fft_time=0;
  double timings[5]={0};

  cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
  cudaEvent_t fft_startEvent, fft_stopEvent;
  checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
  checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
  int NY=plan->N[1];
  float dummy_time=0;


  int *osize_0 =plan->osize_0;// *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1;// *ostart_1 =plan->ostart_1;
  //int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i;//*ostart_1i=plan->ostart_1i;
  //int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  if(direction==-1){
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/
    // FFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecD2Z(plan->fplan_0,(cufftDoubleReal*)data_d, (cufftDoubleComplex*)data_out_d));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;
    // Perform N0/P0 transpose


    if(!plan->oneD){
      plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2,osize_0[0],coords[0]);
    }
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1[0];++i){
        checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }

    if(plan->oneD){
      plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2);
    }
    else{
      plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2,1,coords[1]);
    }
    /**************************************************************/
    /*******************  N0 x N1/P0 x N2/P1 **********************/
    /**************************************************************/
    MPI_Barrier(plan->c_comm);
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
  }
  else if (direction==1){
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }

    if(plan->oneD){
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1);
    }
    else{
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1,1,coords[1]);
    }
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1i[0];++i){
        checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]],CUFFT_INVERSE));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }



    if(!plan->oneD){
      plan->T_plan_1i->execute_gpu(plan->T_plan_1i,data_d,timings,1,osize_1i[0],coords[0]);
    }
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/

    // IFFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecZ2D(plan->iplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleReal*)data_out_d));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;

  }

  timings[4]+=fft_time;
  if(timer==NULL){
    //delete [] timings;
  }
  else{
    timer[0]+=timings[0];
    timer[1]+=timings[1];
    timer[2]+=timings[2];
    timer[3]+=timings[3];
    timer[4]+=timings[4];
  }
  checkCuda_accfft (cudaDeviceSynchronize());
  MPI_Barrier(plan->c_comm);

  return;
}// end accfft_execute_gpu
Esempio n. 3
0
cufftResult WINAPI wine_cufftExecD2Z( cufftHandle plan, cufftDoubleReal *idata, cufftDoubleComplex *odata ){
	WINE_TRACE("\n");
	return cufftExecD2Z( plan, idata, odata );
}
Esempio n. 4
0
void generate_PL_lines_fft(double PLindex, int nr_lines,int linelength, 
						   icl_buffer *y_buffer, //double *y, // out
						   int fielddim_z) 
{

	cufftHandle fftw_plan_transfer_fw, fftw_plan_noise_fw, fftw_plan_transfer_bw;

	// FFT preparation 
	const size_t out_size		 = sizeof(cufftDoubleComplex)*(linelength/2+1);
	const size_t in_size		 = sizeof(double)*(linelength+1);
	
	// memory allocation
	double *noise_in_host		 = (double*) malloc(in_size);  // all these allocs can technically also be done outside of the function, if performance is bad this might be a consideration	
	double *noise_in_device;
	checkCudaErrors(cudaMalloc((void **)&noise_in_device, in_size));

	cufftDoubleComplex *noise_out_host = (cufftDoubleComplex*) fftw_malloc(out_size); // check if i need 2 times +1
	cufftDoubleComplex *noise_out_device; 
	checkCudaErrors(cudaMalloc((void **)&noise_out_device, out_size));

	double *transfer_in_host 	 = (double*) malloc(in_size);  
	double *transfer_in_device;
	checkCudaErrors(cudaMalloc((void **)&transfer_in_device, in_size));
	
	cufftDoubleComplex *transfer_out_host = (cufftDoubleComplex*) fftw_malloc(out_size);	
	cufftDoubleComplex *transfer_out_device;
	checkCudaErrors(cudaMalloc((void **)&transfer_out_device, out_size));

	// plans
	cufftPlan1d(&fftw_plan_transfer_fw,	linelength, CUFFT_D2Z, 1);
	cufftPlan1d(&fftw_plan_noise_fw,	linelength, CUFFT_D2Z, 1);
	cufftPlan1d(&fftw_plan_transfer_bw,	linelength, CUFFT_Z2D, 1);	

	// XXX todo here we can potentially cufftPlanMany, to performa a batch of many plan1d fft
/*
	fftw_plan_transfer_fw = fftw_plan_dft_r2c_1d(linelength, transfer_in, transfer_out,FFTW_MEASURE); 
	fftw_plan_noise_fw    = fftw_plan_dft_r2c_1d(linelength, noise_in, noise_out,FFTW_MEASURE);     
	fftw_plan_transfer_bw = fftw_plan_dft_c2r_1d(linelength, transfer_out, transfer_in,FFTW_MEASURE);	
*/

	for(int v=0; v<nr_lines; v++){

		for(int i=0; i<linelength; i++){
			noise_in_host[i] = rand_01();
			transfer_in_host[i] = rand_01();
		}	

		Box_Mueller(linelength, noise_in_host, transfer_in_host);

		for(int i=0; i<linelength; i++){	
			noise_in_host[i] = 2*noise_in_host[i]-1; // around 0
			noise_in_host[i] = 5*noise_in_host[i];  //changes the deviation, which values need to be put will be investigated			
		}

		transfer_in_host[0]=1.0;
		for(int i=1; i<linelength; i++){
			transfer_in_host[i] = (transfer_in_host[i-1]/(i))*(i-1-(PLindex/2.0));
		}		

		/// (a) moving transfer_in and noise_in to the device
		cudaMemcpy(noise_in_device, noise_in_host, in_size, cudaMemcpyDeviceToHost);
		cudaMemcpy(transfer_in_device, transfer_in_host, in_size, cudaMemcpyDeviceToHost);
		
		//fftw_execute(fftw_plan_noise_fw);	
		//fftw_execute(fftw_plan_transfer_fw);
		cufftExecD2Z(fftw_plan_noise_fw, transfer_in_device, transfer_out_device);
		cufftExecD2Z(fftw_plan_noise_fw, noise_in_device, noise_out_device);

		/// (b) moving back transfer_out and noise out
		cudaMemcpy(transfer_out_host, transfer_out_device, out_size, cudaMemcpyHostToDevice);
		cudaMemcpy(noise_out_host, noise_out_device, out_size, cudaMemcpyHostToDevice);

		for(int i=0; i<0.5*linelength+1; i++){
			double temp = (transfer_out_host[i].x*noise_out_host[i].x+transfer_out_host[i].y*noise_out_host[i].y) / linelength;
			transfer_out_host[i].y = (transfer_out_host[i].x*noise_out_host[i].y-transfer_out_host[i].y*noise_out_host[i].x) / linelength;
			transfer_out_host[i].x = temp;
		}		

		/// (c) moving to the device transfer_out
		cudaMemcpy(transfer_out_device, transfer_out_host, out_size, cudaMemcpyDeviceToHost);

		// fftw_execute(fftw_plan_transfer_bw);	
		cufftExecZ2D(fftw_plan_transfer_bw, transfer_out_device, transfer_in_device);

		//// (d) moving back transfer_in 
		cudaMemcpy(transfer_in_host, transfer_in_device, in_size, cudaMemcpyHostToDevice);

		for(int i=0; i<linelength; i++){
			transfer_in_host[i] = transfer_in_host[i]/sqrt((double) linelength);
		}

		// xxx reduce max/avg
		double average=0;
		for(int i=0; i<linelength; i++){
			average = average + transfer_in_host[i];
		}
		average = average/linelength;


		double *y_host = (double*)malloc(sizeof(cl_double) * linelength * nr_lines + 1);
		size_t index = 0;
		for(int i=0; i<linelength; i++){
			y_host[index]=(transfer_in_host[i]-average);
			index++;
		}

		/// (e) write y_buf
		//icl_local_device* ldev = &local_devices[y_buffer->device->device_id];
		icl_local_buffer* lbuf = (icl_local_buffer*)(y_buffer->buffer_add);	
		cudaMemcpy((void*)lbuf->mem, transfer_out_host, out_size, cudaMemcpyDeviceToHost);
	}
	
	free(transfer_in_host);
	free(transfer_out_host);
	free(noise_out_host);
	free(noise_in_host);

	cudaFree(transfer_in_device);
	cudaFree(transfer_out_device);
	cudaFree(noise_out_device);
	cudaFree(noise_in_device);

	cufftDestroy(fftw_plan_transfer_fw);	
	cufftDestroy(fftw_plan_transfer_bw);
	cufftDestroy(fftw_plan_noise_fw);
//	fftw_cleanup();
}