/* * Class: jcuda_jcufft_JCufft * Method: cufftExecD2ZNative * Signature: (Ljcuda/jcufft/cufftHandle;Ljcuda/Pointer;Ljcuda/Pointer;)I */ JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftExecD2ZNative (JNIEnv *env, jclass cla, jobject handle, jobject rIdata, jobject cOdata) { if (handle == NULL) { ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftExecD2Z"); return JCUFFT_INTERNAL_ERROR; } if (rIdata == NULL) { ThrowByName(env, "java/lang/NullPointerException", "Parameter 'rIdata' is null for cufftExecD2Z"); return JCUFFT_INTERNAL_ERROR; } if (cOdata == NULL) { ThrowByName(env, "java/lang/NullPointerException", "Parameter 'cOdata' is null for cufftExecD2Z"); return JCUFFT_INTERNAL_ERROR; } Logger::log(LOG_TRACE, "Executing cufftExecD2Z\n"); cufftHandle nativePlan = env->GetIntField(handle, cufftHandle_plan); double* nativeRIData = (double*)getPointer(env, rIdata); cufftDoubleComplex* nativeCOData = (cufftDoubleComplex*)getPointer(env, cOdata); cufftResult result = cufftExecD2Z(nativePlan, nativeRIData, nativeCOData); return result; }
void accfft_execute_gpu(accfft_plan_gpu* plan, int direction,double * data_d, double * data_out_d, double * timer,std::bitset<3> xyz){ if(data_d==NULL) data_d=plan->data; if(data_out_d==NULL) data_out_d=plan->data_out; int * coords=plan->coord; int procid=plan->procid; double fft_time=0; double timings[5]={0}; cudaEvent_t memcpy_startEvent, memcpy_stopEvent; cudaEvent_t fft_startEvent, fft_stopEvent; checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) ); checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) ); checkCuda_accfft( cudaEventCreate(&fft_startEvent) ); checkCuda_accfft( cudaEventCreate(&fft_stopEvent) ); int NY=plan->N[1]; float dummy_time=0; int *osize_0 =plan->osize_0;// *ostart_0 =plan->ostart_0; int *osize_1 =plan->osize_1;// *ostart_1 =plan->ostart_1; //int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2; int *osize_1i=plan->osize_1i;//*ostart_1i=plan->ostart_1i; //int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i; if(direction==-1){ /**************************************************************/ /******************* N0/P0 x N1/P1 x N2 **********************/ /**************************************************************/ // FFT in Z direction if(xyz[2]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecD2Z(plan->fplan_0,(cufftDoubleReal*)data_d, (cufftDoubleComplex*)data_out_d)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } else data_out_d=data_d; // Perform N0/P0 transpose if(!plan->oneD){ plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2,osize_0[0],coords[0]); } /**************************************************************/ /******************* N0/P0 x N1 x N2/P1 **********************/ /**************************************************************/ if(xyz[1]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); for (int i=0;i<osize_1[0];++i){ checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]],CUFFT_FORWARD)); } checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } if(plan->oneD){ plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2); } else{ plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2,1,coords[1]); } /**************************************************************/ /******************* N0 x N1/P0 x N2/P1 **********************/ /**************************************************************/ MPI_Barrier(plan->c_comm); if(xyz[0]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } } else if (direction==1){ if(xyz[0]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } if(plan->oneD){ plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1); } else{ plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1,1,coords[1]); } MPI_Barrier(plan->c_comm); /**************************************************************/ /******************* N0/P0 x N1 x N2/P1 **********************/ /**************************************************************/ if(xyz[1]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); for (int i=0;i<osize_1i[0];++i){ checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]],CUFFT_INVERSE)); } checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } if(!plan->oneD){ plan->T_plan_1i->execute_gpu(plan->T_plan_1i,data_d,timings,1,osize_1i[0],coords[0]); } /**************************************************************/ /******************* N0/P0 x N1/P1 x N2 **********************/ /**************************************************************/ // IFFT in Z direction if(xyz[2]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecZ2D(plan->iplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleReal*)data_out_d)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } else data_out_d=data_d; } timings[4]+=fft_time; if(timer==NULL){ //delete [] timings; } else{ timer[0]+=timings[0]; timer[1]+=timings[1]; timer[2]+=timings[2]; timer[3]+=timings[3]; timer[4]+=timings[4]; } checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); return; }// end accfft_execute_gpu
cufftResult WINAPI wine_cufftExecD2Z( cufftHandle plan, cufftDoubleReal *idata, cufftDoubleComplex *odata ){ WINE_TRACE("\n"); return cufftExecD2Z( plan, idata, odata ); }
void generate_PL_lines_fft(double PLindex, int nr_lines,int linelength, icl_buffer *y_buffer, //double *y, // out int fielddim_z) { cufftHandle fftw_plan_transfer_fw, fftw_plan_noise_fw, fftw_plan_transfer_bw; // FFT preparation const size_t out_size = sizeof(cufftDoubleComplex)*(linelength/2+1); const size_t in_size = sizeof(double)*(linelength+1); // memory allocation double *noise_in_host = (double*) malloc(in_size); // all these allocs can technically also be done outside of the function, if performance is bad this might be a consideration double *noise_in_device; checkCudaErrors(cudaMalloc((void **)&noise_in_device, in_size)); cufftDoubleComplex *noise_out_host = (cufftDoubleComplex*) fftw_malloc(out_size); // check if i need 2 times +1 cufftDoubleComplex *noise_out_device; checkCudaErrors(cudaMalloc((void **)&noise_out_device, out_size)); double *transfer_in_host = (double*) malloc(in_size); double *transfer_in_device; checkCudaErrors(cudaMalloc((void **)&transfer_in_device, in_size)); cufftDoubleComplex *transfer_out_host = (cufftDoubleComplex*) fftw_malloc(out_size); cufftDoubleComplex *transfer_out_device; checkCudaErrors(cudaMalloc((void **)&transfer_out_device, out_size)); // plans cufftPlan1d(&fftw_plan_transfer_fw, linelength, CUFFT_D2Z, 1); cufftPlan1d(&fftw_plan_noise_fw, linelength, CUFFT_D2Z, 1); cufftPlan1d(&fftw_plan_transfer_bw, linelength, CUFFT_Z2D, 1); // XXX todo here we can potentially cufftPlanMany, to performa a batch of many plan1d fft /* fftw_plan_transfer_fw = fftw_plan_dft_r2c_1d(linelength, transfer_in, transfer_out,FFTW_MEASURE); fftw_plan_noise_fw = fftw_plan_dft_r2c_1d(linelength, noise_in, noise_out,FFTW_MEASURE); fftw_plan_transfer_bw = fftw_plan_dft_c2r_1d(linelength, transfer_out, transfer_in,FFTW_MEASURE); */ for(int v=0; v<nr_lines; v++){ for(int i=0; i<linelength; i++){ noise_in_host[i] = rand_01(); transfer_in_host[i] = rand_01(); } Box_Mueller(linelength, noise_in_host, transfer_in_host); for(int i=0; i<linelength; i++){ noise_in_host[i] = 2*noise_in_host[i]-1; // around 0 noise_in_host[i] = 5*noise_in_host[i]; //changes the deviation, which values need to be put will be investigated } transfer_in_host[0]=1.0; for(int i=1; i<linelength; i++){ transfer_in_host[i] = (transfer_in_host[i-1]/(i))*(i-1-(PLindex/2.0)); } /// (a) moving transfer_in and noise_in to the device cudaMemcpy(noise_in_device, noise_in_host, in_size, cudaMemcpyDeviceToHost); cudaMemcpy(transfer_in_device, transfer_in_host, in_size, cudaMemcpyDeviceToHost); //fftw_execute(fftw_plan_noise_fw); //fftw_execute(fftw_plan_transfer_fw); cufftExecD2Z(fftw_plan_noise_fw, transfer_in_device, transfer_out_device); cufftExecD2Z(fftw_plan_noise_fw, noise_in_device, noise_out_device); /// (b) moving back transfer_out and noise out cudaMemcpy(transfer_out_host, transfer_out_device, out_size, cudaMemcpyHostToDevice); cudaMemcpy(noise_out_host, noise_out_device, out_size, cudaMemcpyHostToDevice); for(int i=0; i<0.5*linelength+1; i++){ double temp = (transfer_out_host[i].x*noise_out_host[i].x+transfer_out_host[i].y*noise_out_host[i].y) / linelength; transfer_out_host[i].y = (transfer_out_host[i].x*noise_out_host[i].y-transfer_out_host[i].y*noise_out_host[i].x) / linelength; transfer_out_host[i].x = temp; } /// (c) moving to the device transfer_out cudaMemcpy(transfer_out_device, transfer_out_host, out_size, cudaMemcpyDeviceToHost); // fftw_execute(fftw_plan_transfer_bw); cufftExecZ2D(fftw_plan_transfer_bw, transfer_out_device, transfer_in_device); //// (d) moving back transfer_in cudaMemcpy(transfer_in_host, transfer_in_device, in_size, cudaMemcpyHostToDevice); for(int i=0; i<linelength; i++){ transfer_in_host[i] = transfer_in_host[i]/sqrt((double) linelength); } // xxx reduce max/avg double average=0; for(int i=0; i<linelength; i++){ average = average + transfer_in_host[i]; } average = average/linelength; double *y_host = (double*)malloc(sizeof(cl_double) * linelength * nr_lines + 1); size_t index = 0; for(int i=0; i<linelength; i++){ y_host[index]=(transfer_in_host[i]-average); index++; } /// (e) write y_buf //icl_local_device* ldev = &local_devices[y_buffer->device->device_id]; icl_local_buffer* lbuf = (icl_local_buffer*)(y_buffer->buffer_add); cudaMemcpy((void*)lbuf->mem, transfer_out_host, out_size, cudaMemcpyDeviceToHost); } free(transfer_in_host); free(transfer_out_host); free(noise_out_host); free(noise_in_host); cudaFree(transfer_in_device); cudaFree(transfer_out_device); cudaFree(noise_out_device); cudaFree(noise_in_device); cufftDestroy(fftw_plan_transfer_fw); cufftDestroy(fftw_plan_transfer_bw); cufftDestroy(fftw_plan_noise_fw); // fftw_cleanup(); }