/*
 * JNI entry point for ffx.numerics.fft.Complex3DOpenCL.setupNative().
 *
 * Fills a clfftSetupData structure with clfftInitSetupData() and hands it to
 * clfftSetup().
 *
 * Returns the address of the setup structure cast to jlong, or 0 when either
 * clFFT call reports a status other than CLFFT_SUCCESS.
 */
JNIEXPORT jlong JNICALL Java_ffx_numerics_fft_Complex3DOpenCL_setupNative
  (JNIEnv *env, jclass object) {
    /*
     * The address of this structure is returned to the caller, so it must
     * outlive this call. An automatic variable would leave the caller with a
     * dangling pointer; static storage keeps the address valid.
     */
    static clfftSetupData fftSetup;
    clfftStatus err;

    (void) env;
    (void) object;

    err = clfftInitSetupData(&fftSetup);
    if (err != CLFFT_SUCCESS) {
        return (jlong) 0;
    }

    err = clfftSetup(&fftSetup);
    if (err != CLFFT_SUCCESS) {
        return (jlong) 0;
    }

    return (jlong) &fftSetup;
}
// Fills _setup_data via clfftInitSetupData() and then passes it to clfftSetup().
// Each returned status is handed to ASSERT_THROW_CL.
// NOTE(review): ASSERT_THROW_CL is defined elsewhere; presumably it throws on a
// non-success status - confirm against the macro definition.
setup() { ASSERT_THROW_CL(clfftInitSetupData(&_setup_data)); ASSERT_THROW_CL(clfftSetup (&_setup_data)); }
void FC_FUNC_(clfftsetup_low, CLFFTSETUP_LOW)(int * status){ clfftSetupData setup_data; *status = clfftInitSetupData(&setup_data); *status = clfftSetup(&setup_data); }
int _tmain( int argc, _TCHAR* argv[] ) { // This helps with mixing output of both wide and narrow characters to the screen std::ios::sync_with_stdio( false ); // Define MEMORYREPORT on windows platfroms to enable debug memory heap checking #if defined( MEMORYREPORT ) && defined( _WIN32 ) TCHAR logPath[ MAX_PATH ]; ::GetCurrentDirectory( MAX_PATH, logPath ); ::_tcscat_s( logPath, _T( "\\MemoryReport.txt") ); // We leak the handle to this file, on purpose, so that the ::_CrtSetReportFile() can output it's memory // statistics on app shutdown HANDLE hLogFile; hLogFile = ::CreateFile( logPath, GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL ); ::_CrtSetReportMode( _CRT_ASSERT, _CRTDBG_MODE_FILE | _CRTDBG_MODE_WNDW | _CRTDBG_MODE_DEBUG ); ::_CrtSetReportMode( _CRT_ERROR, _CRTDBG_MODE_FILE | _CRTDBG_MODE_WNDW | _CRTDBG_MODE_DEBUG ); ::_CrtSetReportMode( _CRT_WARN, _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG ); ::_CrtSetReportFile( _CRT_ASSERT, hLogFile ); ::_CrtSetReportFile( _CRT_ERROR, hLogFile ); ::_CrtSetReportFile( _CRT_WARN, hLogFile ); int tmp = ::_CrtSetDbgFlag( _CRTDBG_REPORT_FLAG ); tmp |= _CRTDBG_LEAK_CHECK_DF | _CRTDBG_ALLOC_MEM_DF | _CRTDBG_CHECK_ALWAYS_DF; ::_CrtSetDbgFlag( tmp ); // By looking at the memory leak report that is generated by this debug heap, there is a number with // {} brackets that indicates the incremental allocation number of that block. 
If you wish to set // a breakpoint on that allocation number, put it in the _CrtSetBreakAlloc() call below, and the heap // will issue a bp on the request, allowing you to look at the call stack // ::_CrtSetBreakAlloc( 1833 ); #endif /* MEMORYREPORT */ // OpenCL state cl_device_type deviceType = CL_DEVICE_TYPE_ALL; cl_int deviceId = 0; cl_int platformId = 0; // FFT state clfftResultLocation place = CLFFT_INPLACE; clfftLayout inLayout = CLFFT_COMPLEX_INTERLEAVED; clfftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED; clfftPrecision precision = CLFFT_SINGLE; clfftDirection dir = CLFFT_FORWARD; size_t lengths[ 3 ] = {1,1,1}; size_t iStrides[ 4 ] = {0,0,0,0}; size_t oStrides[ 4 ] = {0,0,0,0}; cl_uint profile_count = 0; cl_uint command_queue_flags = 0; size_t batchSize = 1; // Initialize flags for FFT library std::auto_ptr< clfftSetupData > setupData( new clfftSetupData ); OPENCL_V_THROW( clfftInitSetupData( setupData.get( ) ), "clfftInitSetupData failed" ); try { // Declare the supported options. 
po::options_description desc( "clFFT client command line options" ); desc.add_options() ( "help,h", "produces this help message" ) ( "version,v", "Print queryable version information from the clFFT library" ) ( "clinfo,i", "Print queryable information of all the OpenCL runtimes and devices" ) ( "printChosen", "Print queryable information of the selected OpenCL runtime and device" ) ( "gpu,g", "Force selection of OpenCL GPU devices only" ) ( "cpu,c", "Force selection of OpenCL CPU devices only" ) ( "all,a", "Force selection of all OpenCL devices (default)" ) ( "platform", po::value< cl_int >( &platformId )->default_value( 0 ), "Select a specific OpenCL platform id as it is reported by clinfo" ) ( "device", po::value< cl_int >( &deviceId )->default_value( 0 ), "Select a specific OpenCL device id as it is reported by clinfo" ) ( "outPlace,o", "Out of place FFT transform (default: in place)" ) ( "double", "Double precision transform (default: single)" ) ( "inv", "Backward transform (default: forward)" ) ( "dumpKernels,d", "FFT engine will dump generated OpenCL FFT kernels to disk (default: dump off)" ) ( "lenX,x", po::value< size_t >( &lengths[ 0 ] )->default_value( 1024 ), "Specify the length of the 1st dimension of a test array" ) ( "lenY,y", po::value< size_t >( &lengths[ 1 ] )->default_value( 1 ), "Specify the length of the 2nd dimension of a test array" ) ( "lenZ,z", po::value< size_t >( &lengths[ 2 ] )->default_value( 1 ), "Specify the length of the 3rd dimension of a test array" ) ( "isX", po::value< size_t >( &iStrides[ 0 ] )->default_value( 1 ), "Specify the input stride of the 1st dimension of a test array" ) ( "isY", po::value< size_t >( &iStrides[ 1 ] )->default_value( 0 ), "Specify the input stride of the 2nd dimension of a test array" ) ( "isZ", po::value< size_t >( &iStrides[ 2 ] )->default_value( 0 ), "Specify the input stride of the 3rd dimension of a test array" ) ( "iD", po::value< size_t >( &iStrides[ 3 ] )->default_value( 0 ), "input distance 
between subsequent sets of data when batch size > 1" ) ( "osX", po::value< size_t >( &oStrides[ 0 ] )->default_value( 1 ), "Specify the output stride of the 1st dimension of a test array" ) ( "osY", po::value< size_t >( &oStrides[ 1 ] )->default_value( 0 ), "Specify the output stride of the 2nd dimension of a test array" ) ( "osZ", po::value< size_t >( &oStrides[ 2 ] )->default_value( 0 ), "Specify the output stride of the 3rd dimension of a test array" ) ( "oD", po::value< size_t >( &oStrides[ 3 ] )->default_value( 0 ), "output distance between subsequent sets of data when batch size > 1" ) ( "batchSize,b", po::value< size_t >( &batchSize )->default_value( 1 ), "If this value is greater than one, arrays will be used " ) ( "profile,p", po::value< cl_uint >( &profile_count )->default_value( 1 ), "Time and report the kernel speed of the FFT (default: profiling off)" ) ( "inLayout", po::value< clfftLayout >( &inLayout )->default_value( CLFFT_COMPLEX_INTERLEAVED ), "Layout of input data:\n1) interleaved\n2) planar\n3) hermitian interleaved\n4) hermitian planar\n5) real" ) ( "outLayout", po::value< clfftLayout >( &outLayout )->default_value( CLFFT_COMPLEX_INTERLEAVED ), "Layout of input data:\n1) interleaved\n2) planar\n3) hermitian interleaved\n4) hermitian planar\n5) real" ) ; po::variables_map vm; po::store( po::parse_command_line( argc, argv, desc ), vm ); po::notify( vm ); if( vm.count( "version" ) ) { const int indent = countOf( "clFFT client API version: " ); tout << std::left << std::setw( indent ) << _T( "clFFT client API version: " ) << clfftVersionMajor << _T( "." ) << clfftVersionMinor << _T( "." ) << clfftVersionPatch << std::endl; cl_uint libMajor, libMinor, libPatch; clfftGetVersion( &libMajor, &libMinor, &libPatch ); tout << std::left << std::setw( indent ) << _T( "clFFT runtime version: " ) << libMajor << _T( "." ) << libMinor << _T( "." 
) << libPatch << std::endl << std::endl; } if( vm.count( "help" ) ) { // This needs to be 'cout' as program-options does not support wcout yet std::cout << desc << std::endl; return 0; } size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0) | ((vm.count( "cpu" ) > 0) ? 2 : 0) | ((vm.count( "all" ) > 0) ? 4 : 0); if ((mutex & (mutex-1)) != 0) { terr << _T("You have selected mutually-exclusive OpenCL device options:") << std::endl; if (vm.count ( "gpu" ) > 0) terr << _T(" gpu,g Force selection of OpenCL GPU devices only" ) << std::endl; if (vm.count ( "cpu" ) > 0) terr << _T(" cpu,c Force selection of OpenCL CPU devices only" ) << std::endl; if (vm.count ( "all" ) > 0) terr << _T(" all,a Force selection of all OpenCL devices (default)" ) << std::endl; return 1; } if( vm.count( "gpu" ) ) { deviceType = CL_DEVICE_TYPE_GPU; } if( vm.count( "cpu" ) ) { deviceType = CL_DEVICE_TYPE_CPU; } if( vm.count( "all" ) ) { deviceType = CL_DEVICE_TYPE_ALL; } if( vm.count( "clinfo" ) ) { std::vector< cl_platform_id > platformInfos; std::vector< std::vector< cl_device_id > > deviceInfos; discoverCLPlatforms( deviceType, platformInfos, deviceInfos ); prettyPrintCLPlatforms(platformInfos, deviceInfos); return 0; } bool printInfo = false; if( vm.count( "printChosen" ) ) { printInfo = true; } if( vm.count( "outPlace" ) ) { place = CLFFT_OUTOFPLACE; } if( vm.count( "double" ) ) { precision = CLFFT_DOUBLE; } if( vm.count( "inv" ) ) { dir = CLFFT_BACKWARD; } if( profile_count > 1 ) { command_queue_flags |= CL_QUEUE_PROFILING_ENABLE; } if( vm.count( "dumpKernels" ) ) { setupData->debugFlags |= CLFFT_DUMP_PROGRAMS; } int inL = (int)inLayout; int otL = (int)outLayout; // input output layout support matrix int ioLayoutSupport[5][5] = { { 1, 1, 0, 0, 1 }, { 1, 1, 0, 0, 1 }, { 0, 0, 0, 0, 1 }, { 0, 0, 0, 0, 1 }, { 1, 1, 1, 1, 0 }, }; if((inL < 1) || (inL > 5)) throw std::runtime_error( "Invalid Input layout format" ); if((otL < 1) || (otL > 5)) throw std::runtime_error( "Invalid Output layout format" 
); if(ioLayoutSupport[inL-1][otL-1] == 0) throw std::runtime_error( "Invalid combination of Input/Output layout formats" ); if( ((inL == 1) || (inL == 2)) && ((otL == 1) || (otL == 2)) ) // Complex-Complex cases { iStrides[1] = iStrides[1] ? iStrides[1] : lengths[0] * iStrides[0]; iStrides[2] = iStrides[2] ? iStrides[2] : lengths[1] * iStrides[1]; iStrides[3] = iStrides[3] ? iStrides[3] : lengths[2] * iStrides[2]; if(place == CLFFT_INPLACE) { oStrides[0] = iStrides[0]; oStrides[1] = iStrides[1]; oStrides[2] = iStrides[2]; oStrides[3] = iStrides[3]; } else { oStrides[1] = oStrides[1] ? oStrides[1] : lengths[0] * oStrides[0]; oStrides[2] = oStrides[2] ? oStrides[2] : lengths[1] * oStrides[1]; oStrides[3] = oStrides[3] ? oStrides[3] : lengths[2] * oStrides[2]; } } else // Real-Complex and Complex-Real cases { size_t *rst, *cst; size_t N = lengths[0]; size_t Nt = 1 + lengths[0]/2; bool iflag = false; bool rcFull = (inL == 1) || (inL == 2) || (otL == 1) || (otL == 2); if(inLayout == CLFFT_REAL) { iflag = true; rst = iStrides; } else { rst = oStrides; } // either in or out should be REAL // Set either in or out strides whichever is real if(place == CLFFT_INPLACE) { if(rcFull) { rst[1] = rst[1] ? rst[1] : N * 2 * rst[0]; } else { rst[1] = rst[1] ? rst[1] : Nt * 2 * rst[0]; } rst[2] = rst[2] ? rst[2] : lengths[1] * rst[1]; rst[3] = rst[3] ? rst[3] : lengths[2] * rst[2]; } else { rst[1] = rst[1] ? rst[1] : lengths[0] * rst[0]; rst[2] = rst[2] ? rst[2] : lengths[1] * rst[1]; rst[3] = rst[3] ? rst[3] : lengths[2] * rst[2]; } // Set the remaining of in or out strides that is not real if(iflag) { cst = oStrides; } else { cst = iStrides; } if(rcFull) { cst[1] = cst[1] ? cst[1] : N * cst[0]; } else { cst[1] = cst[1] ? cst[1] : Nt * cst[0]; } cst[2] = cst[2] ? cst[2] : lengths[1] * cst[1]; cst[3] = cst[3] ? 
cst[3] : lengths[2] * cst[2]; } if( precision == CLFFT_SINGLE ) transform<float>( lengths, iStrides, oStrides, batchSize, inLayout, outLayout, place, precision, dir, deviceType, deviceId, platformId, printInfo, command_queue_flags, profile_count, setupData ); else transform<double>( lengths, iStrides, oStrides, batchSize, inLayout, outLayout, place, precision, dir, deviceType, deviceId, platformId, printInfo, command_queue_flags, profile_count, setupData ); } catch( std::exception& e ) { terr << _T( "clFFT error condition reported:" ) << std::endl << e.what() << std::endl; return 1; } return 0; }
int main( void ) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX; float *X; cl_event event = NULL; int ret = 0; size_t N = 16; char platform_name[128]; char device_name[128]; /* FFT library realted declarations */ clfftPlanHandle planHandle; clfftDim dim = CLFFT_1D; size_t clLengths[1] = {N}; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); size_t ret_param_size = 0; err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, &ret_param_size); printf("Platform found: %s\n", platform_name); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL ); err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &ret_param_size); printf("Device found on the above platform: %s\n", device_name); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); queue = clCreateCommandQueue( ctx, device, 0, &err ); /* Setup clFFT. */ clfftSetupData fftSetup; err = clfftInitSetupData(&fftSetup); err = clfftSetup(&fftSetup); /* Allocate host & initialize data. */ /* Only allocation shown for simplicity. */ X = (float *)malloc(N * 2 * sizeof(*X)); /* print input array */ printf("\nPerforming fft on an one dimensional array of size N = %ld\n", N); int print_iter = 0; while(print_iter<N) { float x = (float)print_iter; float y = (float)print_iter*3; X[2*print_iter ] = x; X[2*print_iter+1] = y; printf("(%f, %f) ", x, y); print_iter++; } printf("\n\nfft result: \n"); /* Prepare OpenCL memory objects and place data inside them. */ bufX = clCreateBuffer( ctx, CL_MEM_READ_WRITE, N * 2 * sizeof(*X), NULL, &err ); err = clEnqueueWriteBuffer( queue, bufX, CL_TRUE, 0, N * 2 * sizeof( *X ), X, 0, NULL, NULL ); /* Create a default plan for a complex FFT. 
*/ err = clfftCreateDefaultPlan(&planHandle, ctx, dim, clLengths); /* Set plan parameters. */ err = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE); err = clfftSetLayout(planHandle, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED); err = clfftSetResultLocation(planHandle, CLFFT_INPLACE); /* Bake the plan. */ err = clfftBakePlan(planHandle, 1, &queue, NULL, NULL); /* Execute the plan. */ err = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &queue, 0, NULL, NULL, &bufX, NULL, NULL); /* Wait for calculations to be finished. */ err = clFinish(queue); /* Fetch results of calculations. */ err = clEnqueueReadBuffer( queue, bufX, CL_TRUE, 0, N * 2 * sizeof( *X ), X, 0, NULL, NULL ); /* print output array */ print_iter = 0; while(print_iter<N) { printf("(%f, %f) ", X[2*print_iter], X[2*print_iter+1]); print_iter++; } printf("\n"); /* Release OpenCL memory objects. */ clReleaseMemObject( bufX ); free(X); /* Release the plan. */ err = clfftDestroyPlan( &planHandle ); /* Release clFFT library. */ clfftTeardown( ); /* Release OpenCL working objects. */ clReleaseCommandQueue( queue ); clReleaseContext( ctx ); return ret; }
int main(void) { //time meassuring struct timeval tvs; struct timeval tve; float elapsedTime; int Nx; int Ny; int Nz; int N; int plotnum=0; int Tmax=0; int plottime=0; int plotgap=0; float Lx,Ly,Lz; float dt=0.0; float A=0.0; float B=0.0; float Du=0.0; float Dv=0.0; float a[2]={1.0,0.0}; float b[2]={0.5,0.0}; float* x,*y,*z ; float* u[2],*v[2]; //openCL variables cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem cl_u[2] = {NULL,NULL}; cl_mem cl_v[2] = {NULL,NULL}; cl_mem cl_uhat[2] = {NULL,NULL}; cl_mem cl_vhat[2] = {NULL,NULL}; cl_mem cl_x = NULL; cl_mem cl_y = NULL; cl_mem cl_z = NULL; cl_mem cl_kx = NULL; cl_mem cl_ky = NULL; cl_mem cl_kz = NULL; cl_program p_grid = NULL,p_frequencies = NULL,p_initialdata = NULL,p_linearpart=NULL,p_nonlinearpart=NULL; cl_kernel grid = NULL,frequencies = NULL,initialdata = NULL,linearpart=NULL,nonlinearpart=NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices); context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); command_queue = clCreateCommandQueue(context, device_id, 0, &ret); size_t source_size; char *source_str; //end opencl int i,n; int status=0; //int start, finish, count_rate, ind, numthreads char nameconfig[100]=""; //Read infutfile char InputFileName[]="./INPUTFILE"; FILE*fp; fp=fopen(InputFileName,"r"); if(!fp) {fprintf(stderr, "Failed to load IPUTFILE.\n");exit(1);} int ierr=fscanf(fp, "%d %d %d %d %d %f %f %f %f %f %f %f %f", &Nx,&Ny,&Nz,&Tmax,&plotgap,&Lx,&Ly,&Lz,&dt,&Du,&Dv,&A,&B); if(ierr!=13){fprintf(stderr, "INPUTFILE corrupted.\n");exit(1);} fclose(fp); printf("NX %d\n",Nx); printf("NY %d\n",Ny); printf("NZ %d\n",Nz); printf("Tmax %d\n",Tmax); printf("plotgap %d\n",plotgap); printf("Lx %f\n",Lx); printf("Ly %f\n",Ly); printf("Lz %f\n",Lz); printf("dt 
%f\n",dt); printf("Du %f\n",Du); printf("Dv %f\n",Dv); printf("F %f\n",A); printf("k %f\n",B); printf("Read inputfile\n"); N=Nx*Ny*Nz; plottime=plotgap; B=A+B; //ALLocate the memory u[0]=(float*) malloc(N*sizeof(float)); v[0]=(float*) malloc(N*sizeof(float)); x=(float*) malloc(Nx*sizeof(float)); y=(float*) malloc(Ny*sizeof(float)); z=(float*) malloc(Nz*sizeof(float)); //allocate gpu mem cl_u[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_v[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_u[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_v[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_uhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_vhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_uhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_vhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); printf("allocated space\n"); // FFT library realted declarations. clfftPlanHandle planHandle; clfftDim dim = CLFFT_3D; size_t clLengths[3] = {Nx, Ny, Nz}; // Setup clFFT. clfftSetupData fftSetup; ret = clfftInitSetupData(&fftSetup); ret = clfftSetup(&fftSetup); // Create a default plan for a complex FFT. ret = clfftCreateDefaultPlan(&planHandle, context, dim, clLengths); // Set plan parameters. ret = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE); ret = clfftSetLayout(planHandle, CLFFT_COMPLEX_PLANAR, CLFFT_COMPLEX_PLANAR); ret = clfftSetResultLocation(planHandle, CLFFT_OUTOFPLACE); // Bake the plan. ret = clfftBakePlan(planHandle, 1, &command_queue, NULL, NULL); // Create temporary buffer. cl_mem tmpBufferu = 0; cl_mem tmpBufferv = 0; // Size of temp buffer. 
size_t tmpBufferSize = 0; status = clfftGetTmpBufSize(planHandle, &tmpBufferSize); if ((status == 0) && (tmpBufferSize > 0)) { tmpBufferu = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret); tmpBufferv = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret); if (ret != CL_SUCCESS) printf("Error with tmpBuffer clCreateBuffer\n"); } //kernel grid fp = fopen("./grid.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load grid.\n"); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_grid = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_grid, 1, &device_id, NULL, NULL, NULL); grid = clCreateKernel(p_grid, "grid", &ret); //first x cl_x = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_x); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lx); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nx); size_t global_work_size_x[3] = {Nx, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_x, CL_TRUE, 0, Nx * sizeof(float), x, 0, NULL, NULL); ret = clFinish(command_queue); //then y cl_y = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_y); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Ly); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Ny); size_t global_work_size_y[3] = {Ny, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_y, CL_TRUE, 0, Ny * sizeof(float), y, 0, NULL, NULL); ret = clFinish(command_queue); //last z cl_z = clCreateBuffer(context, 
CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_z); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lz); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nz); size_t global_work_size_z[3] = {Nz, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_z, CL_TRUE, 0, Nz * sizeof(float), z, 0, NULL, NULL); ret = clFinish(command_queue); ret = clReleaseKernel(grid); ret = clReleaseProgram(p_grid); //kernel initial data fp = fopen("./initialdata.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load initialdata.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_initialdata = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_initialdata, 1, &device_id, NULL, NULL, NULL); initialdata = clCreateKernel(p_initialdata, "initialdata", &ret); ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u[0]); ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v[0]); ret = clSetKernelArg(initialdata, 2, sizeof(cl_mem),(void *)&cl_u[1]); ret = clSetKernelArg(initialdata, 3, sizeof(cl_mem),(void* )&cl_v[1]); ret = clSetKernelArg(initialdata, 4, sizeof(cl_mem),(void* )&cl_x); ret = clSetKernelArg(initialdata, 5, sizeof(cl_mem),(void* )&cl_y); ret = clSetKernelArg(initialdata, 6, sizeof(cl_mem),(void* )&cl_z); ret = clSetKernelArg(initialdata, 7, sizeof(int),(void* )&Nx); ret = clSetKernelArg(initialdata, 8, sizeof(int),(void* )&Ny); ret = clSetKernelArg(initialdata, 9, sizeof(int),(void* )&Nz); size_t global_work_size[3] = {N, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = 
clReleaseKernel(initialdata); ret = clReleaseProgram(p_initialdata); ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL); ret = clFinish(command_queue); ret = clReleaseMemObject(cl_x); ret = clReleaseMemObject(cl_y); ret = clReleaseMemObject(cl_z); //write to disk fp=fopen("./data/xcoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write xcoord.dat.\n"); exit(1); } for(i=0;i<Nx;i++){fprintf(fp,"%f\n",x[i]);} fclose( fp ); fp=fopen("./data/ycoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write ycoord.dat.\n"); exit(1); } for(i=0;i<Ny;i++){fprintf(fp,"%f\n",y[i]);} fclose( fp ); fp=fopen("./data/zcoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write zcoord.dat.\n"); exit(1); } for(i=0;i<Nz;i++){fprintf(fp,"%f\n",z[i]);} fclose( fp ); free(x); free(y); free(z); n=0; plotnum=0; //output of initial data U char tmp_str[10]; strcpy(nameconfig,"./data/u"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);} fclose( fp ); //V strcpy(nameconfig,"./data/v"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);} fclose( fp ); //frequencies kernel fp = fopen("./frequencies.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load frequencies.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_frequencies = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t 
*)&source_size, &ret); ret = clBuildProgram(p_frequencies, 1, &device_id, NULL, NULL, NULL); frequencies = clCreateKernel(p_frequencies, "frequencies", &ret); //get frequencies first x cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kx); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lx); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nx); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //then y cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_ky); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Ly); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Ny); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //last z cl_kz = clCreateBuffer(context, CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kz); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lz); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nz); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL); ret = clFinish(command_queue); printf("Setup grid, fourier frequencies and initialcondition\n"); //load the rest of the kernels //linearpart kernel fp = fopen("./linearpart.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load linearpart.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_linearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_linearpart, 1, &device_id, NULL, NULL, 
NULL); linearpart = clCreateKernel(p_linearpart, "linearpart", &ret); //kernel nonlinear fp = fopen("./nonlinearpart.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load nonlinearpart.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_nonlinearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_nonlinearpart, 1, &device_id, NULL, NULL, NULL); nonlinearpart = clCreateKernel(p_nonlinearpart, "nonlinearpart", &ret); printf("Got initial data, starting timestepping\n"); gettimeofday(&tvs, NULL); for(n=0;n<=Tmax;n++){ //linear ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv); ret = clFinish(command_queue); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]); ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz); ret = clSetKernelArg(linearpart, 7, sizeof(float),(void* )&dt); ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du); ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv); ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A); ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B); ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]); ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]); ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx); ret = 
clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny); ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv); ret = clFinish(command_queue); //nonlinearpart ret = clSetKernelArg(nonlinearpart, 0, sizeof(cl_mem),(void *)&cl_u[0]); ret = clSetKernelArg(nonlinearpart, 1, sizeof(cl_mem),(void *)&cl_u[1]); ret = clSetKernelArg(nonlinearpart, 2, sizeof(cl_mem),(void* )&cl_v[0]); ret = clSetKernelArg(nonlinearpart, 3, sizeof(cl_mem),(void* )&cl_v[1]); ret = clSetKernelArg(nonlinearpart, 4, sizeof(float),(void* )&dt); ret = clSetKernelArg(nonlinearpart, 5, sizeof(float),(void* )&a[0]); ret = clSetKernelArg(nonlinearpart, 6, sizeof(float),(void* )&a[1]); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); // linear part ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv); ret = clFinish(command_queue); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]); ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz); ret = clSetKernelArg(linearpart, 7, 
sizeof(float),(void* )&dt); ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du); ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv); ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A); ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B); ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]); ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]); ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx); ret = clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny); ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv); ret = clFinish(command_queue); // done if(n==plottime){ printf("time:%f, step:%d,%d\n",n*dt,n,plotnum); plottime=plottime+plotgap; plotnum=plotnum+1; ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL); ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL); ret = clFinish(command_queue); //output of data U char tmp_str[10]; strcpy(nameconfig,"./data/u"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write u-data.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);} fclose( fp ); //V strcpy(nameconfig,"./data/v"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write v-data.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);} fclose( fp ); } } 
gettimeofday(&tve, NULL); printf("Finished time stepping\n"); elapsedTime = (tve.tv_sec - tvs.tv_sec) * 1000.0; // sec to ms elapsedTime += (tve.tv_usec - tvs.tv_usec) / 1000.0; // us to ms printf("%f,",elapsedTime); clReleaseMemObject(cl_u[0]); clReleaseMemObject(cl_u[1]); clReleaseMemObject(cl_v[0]); clReleaseMemObject(cl_v[1]); clReleaseMemObject(cl_uhat[0]); clReleaseMemObject(cl_uhat[1]); clReleaseMemObject(cl_vhat[0]); clReleaseMemObject(cl_vhat[1]); clReleaseMemObject(cl_kx); clReleaseMemObject(cl_ky); clReleaseMemObject(cl_kz); ret = clReleaseKernel(frequencies); ret = clReleaseProgram(p_frequencies); ret = clReleaseKernel(linearpart); ret = clReleaseProgram(p_linearpart); ret = clReleaseKernel(nonlinearpart); ret = clReleaseProgram(p_nonlinearpart); free(u[0]); free(v[0]); clReleaseMemObject(tmpBufferu); clReleaseMemObject(tmpBufferv); /* Release the plan. */ ret = clfftDestroyPlan(&planHandle); /* Release clFFT library. */ clfftTeardown(); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); printf("Program execution complete\n"); return 0; }
int main( void ) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX; float *X; cl_event event = NULL; int ret = 0; const size_t N0 = 4, N1 = 4, N2 = 4; char platform_name[128]; char device_name[128]; /* FFT library realted declarations */ clfftPlanHandle planHandle; clfftDim dim = CLFFT_3D; size_t clLengths[3] = {N0, N1, N2}; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); size_t ret_param_size = 0; err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, &ret_param_size); printf("Platform found: %s\n", platform_name); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL ); err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &ret_param_size); printf("Device found on the above platform: %s\n", device_name); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); queue = clCreateCommandQueue( ctx, device, 0, &err ); /* Setup clFFT. */ clfftSetupData fftSetup; err = clfftInitSetupData(&fftSetup); err = clfftSetup(&fftSetup); /* Allocate host & initialize data. */ /* Only allocation shown for simplicity. */ size_t buffer_size = N0 * N1 * N2 * 2 * sizeof(*X); X = (float *)malloc(buffer_size); /* print input array just using the * indices to fill the array with data */ printf("\nPerforming fft on an three dimensional array of size N0 x N1 x N2 : %ld x %ld x %ld\n", N0, N1, N2); int i, j, k; i = j = k = 0; for (i=0; i<N0; ++i) { for (j=0; j<N1; ++j) { for (k=0; k<N2; ++k) { float x = 0.0f; float y = 0.0f; if (i==0 && j==0 && k==0) { x = y = 0.5f; } unsigned idx = 2*(k+j*N1+i*N0*N1); X[idx] = x; X[idx+1] = y; printf("(%f, %f) ", X[idx], X[idx+1]); } printf("\n"); } printf("\n"); } /* Prepare OpenCL memory objects and place data inside them. 
*/ bufX = clCreateBuffer( ctx, CL_MEM_READ_WRITE, buffer_size, NULL, &err ); err = clEnqueueWriteBuffer( queue, bufX, CL_TRUE, 0, buffer_size, X, 0, NULL, NULL ); /* Create a default plan for a complex FFT. */ err = clfftCreateDefaultPlan(&planHandle, ctx, dim, clLengths); /* Set plan parameters. */ err = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE); err = clfftSetLayout(planHandle, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED); err = clfftSetResultLocation(planHandle, CLFFT_INPLACE); /* Bake the plan. */ err = clfftBakePlan(planHandle, 1, &queue, NULL, NULL); /* Execute the plan. */ err = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &queue, 0, NULL, NULL, &bufX, NULL, NULL); /* Wait for calculations to be finished. */ err = clFinish(queue); /* Fetch results of calculations. */ err = clEnqueueReadBuffer( queue, bufX, CL_TRUE, 0, buffer_size, X, 0, NULL, NULL ); /* print output array */ printf("\n\nfft result: \n"); i = j = k = 0; for (i=0; i<N0; ++i) { for (j=0; j<N1; ++j) { for (k=0; k<N2; ++k) { unsigned idx = 2*(k+j*N1+i*N0*N1); printf("(%f, %f) ", X[idx], X[idx+1]); } printf("\n"); } printf("\n"); } printf("\n"); /* Release OpenCL memory objects. */ clReleaseMemObject( bufX ); free(X); /* Release the plan. */ err = clfftDestroyPlan( &planHandle ); /* Release clFFT library. */ clfftTeardown( ); /* Release OpenCL working objects. */ clReleaseCommandQueue( queue ); clReleaseContext( ctx ); return ret; }
fft_api() { clfftSetupData setupData; CLFFT_CHECK(clfftInitSetupData(&setupData)); CLFFT_CHECK(clfftSetup(&setupData)); }