/// Builds and bakes a batched, in-place 2D clFFT plan that transforms
/// Hermitian-interleaved input into real output on the given queue.
///
/// @param queue        Command queue the plan is baked for; its context is
///                     also used to allocate the scratch buffer.
/// @param size         Transform extent; `size.x` is the row length (N) and
///                     `size.y` the number of rows (M).
/// @param num_batches  Number of transforms executed per enqueue.
///
/// Failures reported by clFFT are handled by the CLFFT_CHECK macro.
ifft2d_hermitian_inplace::ifft2d_hermitian_inplace(
    gpu::compute::command_queue queue,
    const math::ivec2 &size,
    size_t num_batches)
{
    // Function-local static: constructed once, on the first plan creation.
    // NOTE(review): presumably owns the clFFT library setup/teardown - confirm.
    static detail::fft_api fft_api;

    const size_t N = size.x;
    const size_t M = size.y;

    size_t lengths[] = { N, M };

    // In-place layout: a row holds N/2 + 1 complex values on input and is
    // addressed as N + 2 real values on output, so both views span the same
    // number of bytes per row.
    size_t in_stride[]  = { 1, N / 2 + 1 };
    size_t out_stride[] = { 1, N + 2 };

    auto context = queue.getInfo<CL_QUEUE_CONTEXT>();

    CLFFT_CHECK(clfftCreateDefaultPlan(&fft_plan, context(), CLFFT_2D, lengths));
    CLFFT_CHECK(clfftSetPlanBatchSize(fft_plan, num_batches));
    CLFFT_CHECK(clfftSetPlanPrecision(fft_plan, detail::clfft_precision<math::real>::value));
    CLFFT_CHECK(clfftSetResultLocation(fft_plan, CLFFT_INPLACE));
    CLFFT_CHECK(clfftSetLayout(fft_plan, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL));
    CLFFT_CHECK(clfftSetPlanInStride(fft_plan, CLFFT_2D, in_stride));
    CLFFT_CHECK(clfftSetPlanOutStride(fft_plan, CLFFT_2D, out_stride));
    // Backward scale of 1: the inverse transform is left unnormalised.
    CLFFT_CHECK(clfftSetPlanScale(fft_plan, CLFFT_BACKWARD, cl_float(1)));
    CLFFT_CHECK(clfftSetPlanDistance(fft_plan, M * (N / 2 + 1), M * (N + 2)));
    CLFFT_CHECK(clfftBakePlan(fft_plan, 1, &queue(), nullptr, nullptr));

    size_t tmp_sz = 0;
    CLFFT_CHECK(clfftGetTmpBufSize(fft_plan, &tmp_sz));

    // clFFT reports 0 when the baked plan needs no scratch space. A zero-sized
    // OpenCL buffer is invalid (CL_INVALID_BUFFER_SIZE), so tmp_buf is only
    // allocated when scratch space is actually required.
    if (tmp_sz > 0) {
        tmp_buf = gpu::compute::buffer(context, CL_MEM_READ_WRITE, tmp_sz);
    }
}
/*
 * JNI entry point: creates a double-precision, complex-interleaved clFFT plan
 * with unit forward/backward scale and packed strides.
 *
 *   jContext   OpenCL context handle, passed from Java as a jlong.
 *   dimension  3 or 2 select a 3D / 2D plan; any other value yields a 1D plan
 *              that uses dimX only.
 *   dimX/Y/Z   Transform lengths along each axis.
 *
 * Returns the clFFT plan handle as a jlong, or 0 when plan creation or any
 * configuration call fails (a partially configured plan is destroyed first).
 */
JNIEXPORT jlong JNICALL Java_ffx_numerics_fft_Complex3DOpenCL_createDefaultPlanNative
(JNIEnv *env, jclass object, jlong jContext, jint dimension, jint dimX, jint dimY, jint dimZ)
{
    /* Fixed capacity of three entries. Sizing these arrays from the caller's
       `dimension` was undefined for dimension <= 0, where the default branch
       still writes index 0. */
    size_t clLengths[3] = { 1, 1, 1 };
    size_t clStrides[3] = { 1, 1, 1 };
    const cl_float scale = 1.0f;
    cl_context context = (cl_context) jContext;
    clfftPlanHandle planHandle = 0;
    clfftDim dim;
    clfftStatus_ err;

    (void) env;
    (void) object;

    switch ((int) dimension) {
        case 3:
            dim = CLFFT_3D;
            clLengths[0] = (size_t) dimX;
            clLengths[1] = (size_t) dimY;
            clLengths[2] = (size_t) dimZ;
            clStrides[0] = (size_t) 1;
            clStrides[1] = (size_t) dimX;
            clStrides[2] = (size_t) dimX * (size_t) dimY;
            break;
        case 2:
            dim = CLFFT_2D;
            clLengths[0] = (size_t) dimX;
            clLengths[1] = (size_t) dimY;
            clStrides[0] = (size_t) 1;
            clStrides[1] = (size_t) dimX;
            break;
        case 1:
        default:
            dim = CLFFT_1D;
            clLengths[0] = (size_t) dimX;
            clStrides[0] = (size_t) 1;
            break;
    }

    err = clfftCreateDefaultPlan(&planHandle, context, dim, clLengths);
    if (err != CLFFT_SUCCESS) {
        /* No plan exists, so there is nothing to release. */
        return (jlong) 0;
    }

    /* Configure the plan; stop at the first failing call. */
    err = clfftSetPlanInStride(planHandle, dim, clStrides);
    if (err == CLFFT_SUCCESS) {
        err = clfftSetPlanOutStride(planHandle, dim, clStrides);
    }
    if (err == CLFFT_SUCCESS) {
        err = clfftSetPlanScale(planHandle, CLFFT_FORWARD, scale);
    }
    if (err == CLFFT_SUCCESS) {
        err = clfftSetPlanScale(planHandle, CLFFT_BACKWARD, scale);
    }
    if (err == CLFFT_SUCCESS) {
        err = clfftSetPlanPrecision(planHandle, CLFFT_DOUBLE);
    }
    if (err == CLFFT_SUCCESS) {
        err = clfftSetLayout(planHandle, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED);
    }

    if (err != CLFFT_SUCCESS) {
        /* Do not hand a half-configured plan back to Java. */
        clfftDestroyPlan(&planHandle);
        return (jlong) 0;
    }

    return ((jlong) planHandle);
}
/*
 * (Re)creates the FFT plan stored in @fft when none exists yet or when @param
 * differs from the parameters recorded by the previous call.
 *
 * @fft:     FFT state; holds the last-seen parameters and the backend plan.
 * @context: OpenCL context the plan is created in.
 * @queue:   Command queue the AMD plan is baked for (unused by the Apple path).
 * @param:   Requested transform parameters; param->dimensions must be 1, 2 or 3.
 *
 * Returns CL_SUCCESS, CL_INVALID_VALUE when param->dimensions is out of range,
 * or (Apple backend) the error reported by clFFT_CreatePlan. AMD backend
 * failures are handled by UFO_RESOURCES_CHECK_CLERR.
 */
cl_int
ufo_fft_update (UfoFft *fft, cl_context context, cl_command_queue queue, UfoFftParameter *param)
{
    gboolean changed;
    cl_int error;

    error = CL_SUCCESS;

    /* param->dimensions indexes a four-entry lookup table below whose slot 0
     * is only a placeholder, so reject anything outside 1..3 up front. */
    if (param->dimensions < 1 || param->dimensions > 3)
        return CL_INVALID_VALUE;

    /* Every field that influences the plan takes part in change detection;
     * comparing only size[0] and size[1] kept a stale plan alive when the
     * third extent, the dimensionality, the batch count or the result
     * location changed. */
    changed = param->size[0] != fft->seen.size[0] ||
              param->size[1] != fft->seen.size[1] ||
              param->size[2] != fft->seen.size[2] ||
              param->dimensions != fft->seen.dimensions ||
              param->batch != fft->seen.batch ||
              param->zeropad != fft->seen.zeropad;

    if (changed)
        memcpy (&fft->seen, param, sizeof (UfoFftParameter));

#ifdef HAVE_AMD
    if (fft->amd_plan == 0 || changed) {
        /* we use param->dimension to index into this array! */
        clfftDim dimension[4] = { 0, CLFFT_1D, CLFFT_2D, CLFFT_3D };

        /* Release the previous plan before building its replacement. */
        if (fft->amd_plan != 0) {
            clfftDestroyPlan (&fft->amd_plan);
            fft->amd_plan = 0;
        }

        UFO_RESOURCES_CHECK_CLERR (clfftCreateDefaultPlan (&fft->amd_plan, context, dimension[param->dimensions], param->size));
        UFO_RESOURCES_CHECK_CLERR (clfftSetPlanBatchSize (fft->amd_plan, param->batch));
        UFO_RESOURCES_CHECK_CLERR (clfftSetPlanPrecision (fft->amd_plan, CLFFT_SINGLE));
        UFO_RESOURCES_CHECK_CLERR (clfftSetLayout (fft->amd_plan, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED));
        UFO_RESOURCES_CHECK_CLERR (clfftSetResultLocation (fft->amd_plan, param->zeropad ? CLFFT_INPLACE : CLFFT_OUTOFPLACE));
        UFO_RESOURCES_CHECK_CLERR (clfftBakePlan (fft->amd_plan, 1, &queue, NULL, NULL));
    }
#else
    if (fft->apple_plan == NULL || changed) {
        clFFT_Dim3 size;

        /* we use param->dimension to index into this array! */
        clFFT_Dimension dimension[4] = { 0, clFFT_1D, clFFT_2D, clFFT_3D };

        size.x = param->size[0];
        size.y = param->size[1];
        size.z = param->size[2];

        /* Release the previous plan before building its replacement. */
        if (fft->apple_plan != NULL) {
            clFFT_DestroyPlan (fft->apple_plan);
            fft->apple_plan = NULL;
        }

        fft->apple_plan = clFFT_CreatePlan (context, size, dimension[param->dimensions], clFFT_InterleavedComplexFormat, &error);
    }
#endif

    return error;
}
void resize(::size_t n) { if (n == _in.size() && n == _out.size()) return; _in.resize(n); _out.resize(n); ::size_t clLengths[1] = { n }; ASSERT_THROW_CL(clfftCreateDefaultPlan(&_plan_handle, _queue.getInfo<CL_QUEUE_CONTEXT>()(), CLFFT_1D, clLengths)); ASSERT_THROW_CL(clfftSetPlanPrecision ( _plan_handle, CLFFT_SINGLE)); ASSERT_THROW_CL(clfftSetPlanScale ( _plan_handle, _direction, 1.0f)); ASSERT_THROW_CL(clfftSetLayout ( _plan_handle, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED)); ASSERT_THROW_CL(clfftSetResultLocation( _plan_handle, CLFFT_OUTOFPLACE)); ASSERT_THROW_CL(clfftBakePlan ( _plan_handle, 1, &_queue(), NULL, NULL)); }
/*
 * Fortran-callable wrapper around clfftCreateDefaultPlan.
 *
 *   plHandle   out: receives a heap-allocated clfftPlanHandle, or NULL when a
 *              host allocation fails.
 *   context    OpenCL context, passed by reference.
 *   dim        number of transform dimensions, passed by reference.
 *   clLengths  *dim transform lengths as 64-bit integers.
 *   status     out: status returned by clfftCreateDefaultPlan, or
 *              CL_OUT_OF_HOST_MEMORY when a host allocation fails.
 *
 * The handle storage is not freed here, even when plan creation fails;
 * releasing it remains the caller's responsibility.
 */
void FC_FUNC_(clfftcreatedefaultplan_low, CLFFTCREATEDEFAULTPLAN_LOW)
     (clfftPlanHandle ** plHandle, cl_context * context, const int * dim, const cl_long * clLengths, int * status){
  /* Number of lengths to copy; a non-positive *dim copies nothing and is
     still forwarded unchanged so clFFT reports the error. */
  const int ndim = (*dim > 0) ? *dim : 0;
  size_t * lengths_size_t;
  int i;

  *plHandle = NULL;

  /* Always request at least one element so the size is never zero and never
     derived from a negative count. */
  lengths_size_t = (size_t *) malloc(sizeof(size_t) * (size_t) (ndim > 0 ? ndim : 1));
  if(lengths_size_t == NULL){
    *status = CL_OUT_OF_HOST_MEMORY;
    return;
  }

  /* Convert the 64-bit lengths to the size_t values clFFT expects. */
  for(i = 0; i < ndim; i++){
    lengths_size_t[i] = (size_t) clLengths[i];
  }

  *plHandle = (clfftPlanHandle *) malloc(sizeof(clfftPlanHandle));
  if(*plHandle == NULL){
    free(lengths_size_t);
    *status = CL_OUT_OF_HOST_MEMORY;
    return;
  }

  *status = clfftCreateDefaultPlan(*plHandle, *context, *dim, lengths_size_t);

  free(lengths_size_t);
}
// Runs one clFFT transform end to end as a self-test / benchmark:
//   1. derives the transform dimension and buffer sizes from lengths/strides,
//   2. creates the OpenCL context, queue and buffers and uploads an impulse-style input,
//   3. creates, configures and bakes a clFFT plan,
//   4. enqueues the transform profile_count times and reports timings,
//   5. reads the result back, compares it with the expected pattern and prints PASS/FAIL,
//   6. destroys the plan and releases the OpenCL resources.
//
// Parameters:
//   lengths               X/Y/Z lengths; entries equal to 0 are overwritten with 1 (the array is modified)
//   inStrides, outStrides four entries each are read; index 3 is also used as the per-batch distance
//   batch_size            number of transforms per enqueue
//   in_layout, out_layout clFFT data layouts of the input and output
//   place                 CLFFT_INPLACE or CLFFT_OUTOFPLACE
//   precision, dir        forwarded to the plan / to clfftEnqueueTransform
//   deviceType, deviceId, platformId, printInfo   forwarded to initializeCL
//   command_queue_flags   forwarded to createOpenCLCommandQueue; also gates the timer printout
//   profile_count         number of times the transform is enqueued
//   setupData             passed to clfftSetup
//
// Returns 0 on completion, or CLFFT_INVALID_ARG_VALUE for an unrecognized layout. Failing
// OpenCL/clFFT calls go through OPENCL_V_THROW; unsupported layout combinations throw
// std::runtime_error.
//
// NOTE(review): `T` is not declared in the visible source; presumably this is a function
// template over the element type and the template header precedes this block - confirm.
int transform( size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
                clfftLayout in_layout, clfftLayout out_layout,
                clfftResultLocation place, clfftPrecision precision, clfftDirection dir,
                cl_device_type deviceType, cl_int deviceId, cl_int platformId, bool printInfo,
                cl_uint command_queue_flags, cl_uint profile_count,
                std::auto_ptr< clfftSetupData > setupData )
{
    // Our command line does not specify what dimension FFT we wish to transform; we decode
    // this from the lengths that the user specifies for X, Y, Z. A length of one means that
    // the user does not want that dimension.

    const size_t max_dimensions = 3;
    size_t strides[ 4 ];
    size_t o_strides[ 4 ];
    size_t fftVectorSize = 0;
    size_t fftVectorSizePadded = 0;
    size_t fftBatchSize = 0;
    size_t outfftVectorSize = 0;
    size_t outfftVectorSizePadded = 0;
    size_t outfftBatchSize = 0;
    size_t size_of_input_buffers_in_bytes = 0;
    size_t size_of_output_buffers_in_bytes = 0;
    cl_uint number_of_output_buffers = 0;
    clfftDim dim = CLFFT_1D;
    cl_mem input_cl_mem_buffers [2] = { NULL, NULL };
    cl_mem output_cl_mem_buffers[2] = { NULL, NULL };
    std::vector< cl_device_id > device_id;
    cl_context context;
    cl_command_queue queue;
    cl_event outEvent = NULL;
    clfftPlanHandle plan_handle;

    // Treat a length of 0 as "dimension not used" by normalizing it to 1.
    for (unsigned u = 0; u < max_dimensions; ++u) {
        if (0 != lengths[u])
            continue;
        lengths[u] = 1;
    }

    // A Y length above 1 selects 2D; a Z length above 1 selects 3D (and overrides 2D).
    if( lengths[ 1 ] > 1 )
    {
        dim = CLFFT_2D;
    }
    if( lengths[ 2 ] > 1 )
    {
        dim = CLFFT_3D;
    }

    // Local, mutable copies of the caller's strides; entry 3 is the per-batch distance.
    strides[ 0 ] = inStrides[0];
    strides[ 1 ] = inStrides[1];
    strides[ 2 ] = inStrides[2];
    strides[ 3 ] = inStrides[3];

    o_strides[ 0 ] = outStrides[0];
    o_strides[ 1 ] = outStrides[1];
    o_strides[ 2 ] = outStrides[2];
    o_strides[ 3 ] = outStrides[3];

    // Element counts: the logical size of one transform, its padded size (batch distance)
    // and the size of the whole batch.
    fftVectorSize = lengths[0] * lengths[1] * lengths[2];
    fftVectorSizePadded = strides[3];
    fftBatchSize = fftVectorSizePadded * batch_size;

    // Not referenced again in this function.
    size_t Nt = 1 + lengths[0]/2;

    if(place == CLFFT_INPLACE)
    {
        // In-place: the output shares the input's sizes.
        outfftVectorSize = fftVectorSize;
        outfftVectorSizePadded = fftVectorSizePadded;
        outfftBatchSize = fftBatchSize;
    }
    else
    {
        outfftVectorSize = lengths[0] * lengths[1] * lengths[2];
        outfftVectorSizePadded = o_strides[3];
        outfftBatchSize = outfftVectorSizePadded * batch_size;
    }

    // Real to complex case
    // (this overrides the in-place sizes above: input and output sizes always come from
    // their own strides when either side is real)
    if( (in_layout == CLFFT_REAL) || (out_layout == CLFFT_REAL) )
    {
        fftVectorSizePadded = strides[3];
        fftBatchSize = fftVectorSizePadded * batch_size;

        outfftVectorSizePadded = o_strides[3];
        outfftBatchSize = outfftVectorSizePadded * batch_size;

        fftVectorSize = lengths[0] * lengths[1] * lengths[2];
        outfftVectorSize = fftVectorSize;
    }

    // Number and byte size of the output buffers: planar layouts use two buffers of T,
    // interleaved layouts one buffer of std::complex<T>, real one buffer of T.
    switch( out_layout )
    {
    case CLFFT_COMPLEX_INTERLEAVED:
        number_of_output_buffers = 1;
        size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > );
        break;
    case CLFFT_COMPLEX_PLANAR:
        number_of_output_buffers = 2;
        size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T);
        break;
    case CLFFT_HERMITIAN_INTERLEAVED:
        number_of_output_buffers = 1;
        size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > );
        break;
    case CLFFT_HERMITIAN_PLANAR:
        number_of_output_buffers = 2;
        size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T);
        break;
    case CLFFT_REAL:
        number_of_output_buffers = 1;
        size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T);
        break;
    }

    // Fill the input buffers
    // Each case creates the context, queue and buffers, builds the host-side test input
    // and uploads it with blocking writes.
    switch( in_layout )
    {
    case CLFFT_COMPLEX_INTERLEAVED:
        {
            // This call creates our openCL context and sets up our devices; expected to throw on error
            size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > );

            device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
            createOpenCLCommandQueue( context,
                command_queue_flags, queue,
                device_id,
                size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
                size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);

            std::vector< std::complex< T > > input( fftBatchSize );

            // set zero
            // NOTE: the index is a cl_uint while fftBatchSize is a size_t.
            for( cl_uint i = 0; i < fftBatchSize; ++i )
            {
                input[ i ] = 0;
            }

            // impulse test case
            // Every strided element of every batch is set to 1; padding stays 0.
            for(size_t b = 0; b < batch_size; b++)
            {
                size_t p3 = b * strides[3];
                for(size_t k = 0; k < lengths[2]; k++)
                {
                    size_t p2 = p3 + k * strides[2];
                    for(size_t j = 0; j < lengths[1]; j++)
                    {
                        size_t p1 = p2 + j * strides[1];
                        for(size_t i = 0; i < lengths[0]; i++)
                        {
                            size_t p0 = p1 + i * strides[0];
                            input[p0] = 1;
                        }
                    }
                }
            }

            OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ],
                0, NULL, &outEvent ),
                "clEnqueueWriteBuffer failed" );
        }
        break;
    case CLFFT_COMPLEX_PLANAR:
        {
            // This call creates our openCL context and sets up our devices; expected to throw on error
            size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T );

            device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
            createOpenCLCommandQueue( context,
                command_queue_flags, queue,
                device_id,
                size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
                size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);

            std::vector< T > real( fftBatchSize );
            std::vector< T > imag( fftBatchSize );

            // set zero
            for( cl_uint i = 0; i < fftBatchSize; ++i )
            {
                real[ i ] = 0;
                imag[ i ] = 0;
            }

            // impulse test case
            // Real part of every strided element is set to 1; the imaginary plane stays 0.
            for(size_t b = 0; b < batch_size; b++)
            {
                size_t p3 = b * strides[3];
                for(size_t k = 0; k < lengths[2]; k++)
                {
                    size_t p2 = p3 + k * strides[2];
                    for(size_t j = 0; j < lengths[1]; j++)
                    {
                        size_t p1 = p2 + j * strides[1];
                        for(size_t i = 0; i < lengths[0]; i++)
                        {
                            size_t p0 = p1 + i * strides[0];
                            real[p0] = 1;
                        }
                    }
                }
            }

            OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
                0, NULL, &outEvent ),
                "clEnqueueWriteBuffer failed" );
            OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
                0, NULL, &outEvent ),
                "clEnqueueWriteBuffer failed" );
        }
        break;
    case CLFFT_HERMITIAN_INTERLEAVED:
        {
            // This call creates our openCL context and sets up our devices; expected to throw on error
            size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > );

            device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
            createOpenCLCommandQueue( context,
                command_queue_flags, queue,
                device_id,
                size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
                size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);

            std::vector< std::complex< T > > input( fftBatchSize );

            // set zero
            for( cl_uint i = 0; i < fftBatchSize; ++i )
            {
                input[ i ] = 0;
            }

            // impulse test case
            // Only the first element of each batch is non-zero and holds outfftVectorSize;
            // the real-output check further down expects every output element to be 1.
            for(size_t b = 0; b < batch_size; b++)
            {
                size_t p3 = b * strides[3];
                input[p3] = static_cast<T>(outfftVectorSize);
            }

            OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ],
                0, NULL, &outEvent ),
                "clEnqueueWriteBuffer failed" );
        }
        break;
    case CLFFT_HERMITIAN_PLANAR:
        {
            // This call creates our openCL context and sets up our devices; expected to throw on error
            size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T );

            device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
            createOpenCLCommandQueue( context,
                command_queue_flags, queue,
                device_id,
                size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
                size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);

            std::vector< T > real( fftBatchSize );
            std::vector< T > imag( fftBatchSize );

            // set zero
            for( cl_uint i = 0; i < fftBatchSize; ++i )
            {
                real[ i ] = 0;
                imag[ i ] = 0;
            }

            // impulse test case
            // Same pattern as the interleaved Hermitian case, written to the real plane.
            for(size_t b = 0; b < batch_size; b++)
            {
                size_t p3 = b * strides[3];
                real[p3] = static_cast<T>(outfftVectorSize);
            }

            OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
                0, NULL, &outEvent ),
                "clEnqueueWriteBuffer failed" );
            OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
                0, NULL, &outEvent ),
                "clEnqueueWriteBuffer failed" );
        }
        break;
    case CLFFT_REAL:
        {
            // This call creates our openCL context and sets up our devices; expected to throw on error
            size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T );

            device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
            createOpenCLCommandQueue( context,
                command_queue_flags, queue,
                device_id,
                size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
                size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);

            std::vector< T > real( fftBatchSize );

            // set zero
            for( cl_uint i = 0; i < fftBatchSize; ++i )
            {
                real[ i ] = 0;
            }

            // impulse test case
            // Every strided element of every batch is set to 1.
            for(size_t b = 0; b < batch_size; b++)
            {
                size_t p3 = b * strides[3];
                for(size_t k = 0; k < lengths[2]; k++)
                {
                    size_t p2 = p3 + k * strides[2];
                    for(size_t j = 0; j < lengths[1]; j++)
                    {
                        size_t p1 = p2 + j * strides[1];
                        for(size_t i = 0; i < lengths[0]; i++)
                        {
                            size_t p0 = p1 + i * strides[0];
                            real[p0] = 1;
                        }
                    }
                }
            }

            OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
                0, NULL, &outEvent ),
                "clEnqueueWriteBuffer failed" );
        }
        break;
    default:
        {
            throw std::runtime_error( "Input layout format not yet supported" );
        }
        break;
    }

    // Discover and load the timer module if present
    void* timerLibHandle = LoadSharedLibrary( "lib", "StatTimer", false );
    if( timerLibHandle == NULL )
    {
        terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl;
    }

    // Timer module discovered and loaded successfully
    // Initialize function pointers to call into the shared module
    // NOTE: this lookup also runs when timerLibHandle is NULL; the code below only uses the
    // timer when get_timer is non-null.
    PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) );

    // Create and initialize our timer class, if the external timer shared library loaded
    baseStatTimer* timer = NULL;
    size_t clFFTID = 0;
    if( get_timer )
    {
        timer = get_timer( CLFFT_GPU );
        timer->Reserve( 1, profile_count );
        timer->setNormalize( true );

        clFFTID = timer->getUniqueID( "clFFT", 0 );
    }

    OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );

    OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" );

    // Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
    // Override those defaults with the caller's configuration.
    OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
    OPENCL_V_THROW( clfftSetLayout( plan_handle, in_layout, out_layout ), "clfftSetLayout failed" );
    OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batch_size ), "clfftSetPlanBatchSize failed" );
    OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );

    OPENCL_V_THROW (clfftSetPlanInStride ( plan_handle, dim, strides ), "clfftSetPlanInStride failed" );
    OPENCL_V_THROW (clfftSetPlanOutStride ( plan_handle, dim, o_strides ), "clfftSetPlanOutStride failed" );
    OPENCL_V_THROW (clfftSetPlanDistance ( plan_handle, strides[ 3 ], o_strides[ 3 ]), "clfftSetPlanDistance failed" );

    // Set backward scale factor to 1.0 for non real FFTs to do correct output checks
    if(dir == CLFFT_BACKWARD && in_layout != CLFFT_REAL && out_layout != CLFFT_REAL)
        OPENCL_V_THROW (clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, (cl_float)1.0f ), "clfftSetPlanScale failed" );

    OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &queue, NULL, NULL ), "clfftBakePlan failed" );

    //get the buffersize
    size_t buffersize=0;
    OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" );

    //allocate the intermediate buffer
    // Only created when the plan reports a non-zero size; otherwise NULL is passed to the
    // transform.
    cl_mem clMedBuffer=NULL;

    if (buffersize)
    {
        cl_int medstatus;
        clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
        OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
    }

    // Reject layouts that are not one of the five known values.
    switch( in_layout )
    {
    case CLFFT_COMPLEX_INTERLEAVED:
    case CLFFT_COMPLEX_PLANAR:
    case CLFFT_HERMITIAN_INTERLEAVED:
    case CLFFT_HERMITIAN_PLANAR:
    case CLFFT_REAL:
        break;
    default:
        // Don't recognize input layout
        return CLFFT_INVALID_ARG_VALUE;
    }

    switch( out_layout )
    {
    case CLFFT_COMPLEX_INTERLEAVED:
    case CLFFT_COMPLEX_PLANAR:
    case CLFFT_HERMITIAN_INTERLEAVED:
    case CLFFT_HERMITIAN_PLANAR:
    case CLFFT_REAL:
        break;
    default:
        // Don't recognize output layout
        return CLFFT_INVALID_ARG_VALUE;
    }

    // In-place transforms that change layout are only allowed for the combinations below;
    // everything else throws.
    if (( place == CLFFT_INPLACE )
        && ( in_layout != out_layout )) {
        switch( in_layout )
        {
        case CLFFT_COMPLEX_INTERLEAVED:
            {
                if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) )
                {
                    throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
                }
                break;
            }
        case CLFFT_COMPLEX_PLANAR:
            {
                if( (out_layout == CLFFT_COMPLEX_INTERLEAVED) || (out_layout == CLFFT_HERMITIAN_INTERLEAVED) )
                {
                    throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
                }
                break;
            }
        case CLFFT_HERMITIAN_INTERLEAVED:
            {
                // Only Hermitian-interleaved -> real is accepted in place.
                if( out_layout != CLFFT_REAL )
                {
                    throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
                }
                break;
            }
        case CLFFT_HERMITIAN_PLANAR:
            {
                // Always rejected when the output layout differs.
                throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
                break;
            }
        case CLFFT_REAL:
            {
                if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) )
                {
                    throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
                }
                break;
            }
        }
    }

    // Loop as many times as the user specifies to average out the timings
    //
    // BuffersOut is NULL for in-place transforms, so clFFT writes into the input buffers.
    cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &output_cl_mem_buffers[ 0 ];

    Timer tr;
    tr.Start();

    for( cl_uint i = 0; i < profile_count; ++i )
    {
        if( timer ) timer->Start( clFFTID );

        OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent,
            &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ),
            "clfftEnqueueTransform failed" );

        if( timer ) timer->Stop( clFFTID );
    }
    OPENCL_V_THROW( clFinish( queue ), "clFinish failed" );
    if(clMedBuffer) clReleaseMemObject(clMedBuffer);

    // Average wall time per enqueue, and the operation count 5 * N * log2(N) of one transform.
    double wtime = tr.Sample()/((double)profile_count);
    size_t totalLen = 1;
    for(int i=0; i<dim; i++) totalLen *= lengths[i];
    double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);

    if(profile_count > 1)
    {
        tout << "\nExecution wall time: " << 1000.0*wtime << " ms" << std::endl;
        tout << "Execution gflops: " << ((double)batch_size * opsconst)/(1000000000.0*wtime) << std::endl;
    }

    if( timer && (command_queue_flags & CL_QUEUE_PROFILING_ENABLE) )
    {
        // Remove all timings that are outside of 2 stddev (keep 65% of samples); we ignore outliers to get a more consistent result
        timer->pruneOutliers( 2.0 );
        timer->Print( );
        timer->Reset( );
    }

    /*****************/
    FreeSharedLibrary( timerLibHandle );

    // Read and check output data
    // This check is not valid if the FFT is executed multiple times inplace.
    //
    if (( place == CLFFT_OUTOFPLACE ) || ( profile_count == 1))
    {
        bool checkflag= false;
        switch( out_layout )
        {
        case CLFFT_HERMITIAN_INTERLEAVED:
        case CLFFT_COMPLEX_INTERLEAVED:
            {
                std::vector< std::complex< T > > output( outfftBatchSize );

                if( place == CLFFT_INPLACE )
                {
                    // In place: the result lives in the input buffer and is read with the
                    // input byte size.
                    OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &output[ 0 ],
                        0, NULL, NULL ),
                        "Reading the result buffer failed" );
                }
                else
                {
                    OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &output[ 0 ],
                        0, NULL, NULL ),
                        "Reading the result buffer failed" );
                }

                //check output data
                // Expected: the first element of each padded vector equals outfftVectorSize,
                // every other real part and every imaginary part is exactly 0.
                for( cl_uint i = 0; i < outfftBatchSize; ++i )
                {
                    if (0 == (i % outfftVectorSizePadded))
                    {
                        if (output[i].real() != outfftVectorSize)
                        {
                            checkflag = true;
                            break;
                        }
                    }
                    else
                    {
                        if (output[ i ].real() != 0)
                        {
                            checkflag = true;
                            break;
                        }
                    }

                    if (output[ i ].imag() != 0)
                    {
                        checkflag = true;
                        break;
                    }
                }
            }
            break;
        case CLFFT_HERMITIAN_PLANAR:
        case CLFFT_COMPLEX_PLANAR:
            {
                std::valarray< T > real( outfftBatchSize );
                std::valarray< T > imag( outfftBatchSize );

                if( place == CLFFT_INPLACE )
                {
                    OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
                        0, NULL, NULL ),
                        "Reading the result buffer failed" );
                    OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
                        0, NULL, NULL ),
                        "Reading the result buffer failed" );
                }
                else
                {
                    OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
                        0, NULL, NULL ),
                        "Reading the result buffer failed" );
                    OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 1 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &imag[ 0 ],
                        0, NULL, NULL ),
                        "Reading the result buffer failed" );
                }

                // Check output data
                // Same expectation as the interleaved case, applied to the two planes.
                for( cl_uint i = 0; i < outfftBatchSize; ++i )
                {
                    if (0 == (i % outfftVectorSizePadded))
                    {
                        if (real[i] != outfftVectorSize)
                        {
                            checkflag = true;
                            break;
                        }
                    }
                    else
                    {
                        if (real[i] != 0)
                        {
                            checkflag = true;
                            break;
                        }
                    }

                    if (imag[i] != 0)
                    {
                        checkflag = true;
                        break;
                    }
                }
            }
            break;
        case CLFFT_REAL:
            {
                std::valarray< T > real( outfftBatchSize );

                if( place == CLFFT_INPLACE )
                {
                    OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
                        0, NULL, NULL ),
                        "Reading the result buffer failed" );
                }
                else
                {
                    OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
                        0, NULL, NULL ),
                        "Reading the result buffer failed" );
                }

                ////check output data
                // Expected: every strided output element equals 1. The break only leaves the
                // innermost loop; the outer loops keep scanning with checkflag already set.
                for(size_t b = 0; b < batch_size; b++)
                {
                    size_t p3 = b * o_strides[3];
                    for(size_t k = 0; k < lengths[2]; k++)
                    {
                        size_t p2 = p3 + k * o_strides[2];
                        for(size_t j = 0; j < lengths[1]; j++)
                        {
                            size_t p1 = p2 + j * o_strides[1];
                            for(size_t i = 0; i < lengths[0]; i++)
                            {
                                size_t p0 = p1 + i * o_strides[0];

                                if (real[p0] != 1)
                                {
                                    checkflag = true;
                                    break;
                                }
                            }
                        }
                    }
                }
            }
            break;
        default:
            {
                // Reached for an unhandled *output* layout, although the message says "Input".
                throw std::runtime_error( "Input layout format not yet supported" );
            }
            break;
        }

        if (checkflag)
        {
            std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
        }
        else
        {
            std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
        }
    }

    // Tear down the plan, the clFFT library state and all OpenCL objects created above.
    OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
    OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" );

    cleanupCL( &context, &queue, countOf( input_cl_mem_buffers ), input_cl_mem_buffers, countOf( output_cl_mem_buffers ), output_cl_mem_buffers, &outEvent );
    return 0;
}
int main( void ) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX; float *X; cl_event event = NULL; int ret = 0; size_t N = 16; char platform_name[128]; char device_name[128]; /* FFT library realted declarations */ clfftPlanHandle planHandle; clfftDim dim = CLFFT_1D; size_t clLengths[1] = {N}; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); size_t ret_param_size = 0; err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, &ret_param_size); printf("Platform found: %s\n", platform_name); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL ); err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &ret_param_size); printf("Device found on the above platform: %s\n", device_name); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); queue = clCreateCommandQueue( ctx, device, 0, &err ); /* Setup clFFT. */ clfftSetupData fftSetup; err = clfftInitSetupData(&fftSetup); err = clfftSetup(&fftSetup); /* Allocate host & initialize data. */ /* Only allocation shown for simplicity. */ X = (float *)malloc(N * 2 * sizeof(*X)); /* print input array */ printf("\nPerforming fft on an one dimensional array of size N = %ld\n", N); int print_iter = 0; while(print_iter<N) { float x = (float)print_iter; float y = (float)print_iter*3; X[2*print_iter ] = x; X[2*print_iter+1] = y; printf("(%f, %f) ", x, y); print_iter++; } printf("\n\nfft result: \n"); /* Prepare OpenCL memory objects and place data inside them. */ bufX = clCreateBuffer( ctx, CL_MEM_READ_WRITE, N * 2 * sizeof(*X), NULL, &err ); err = clEnqueueWriteBuffer( queue, bufX, CL_TRUE, 0, N * 2 * sizeof( *X ), X, 0, NULL, NULL ); /* Create a default plan for a complex FFT. 
*/ err = clfftCreateDefaultPlan(&planHandle, ctx, dim, clLengths); /* Set plan parameters. */ err = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE); err = clfftSetLayout(planHandle, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED); err = clfftSetResultLocation(planHandle, CLFFT_INPLACE); /* Bake the plan. */ err = clfftBakePlan(planHandle, 1, &queue, NULL, NULL); /* Execute the plan. */ err = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &queue, 0, NULL, NULL, &bufX, NULL, NULL); /* Wait for calculations to be finished. */ err = clFinish(queue); /* Fetch results of calculations. */ err = clEnqueueReadBuffer( queue, bufX, CL_TRUE, 0, N * 2 * sizeof( *X ), X, 0, NULL, NULL ); /* print output array */ print_iter = 0; while(print_iter<N) { printf("(%f, %f) ", X[2*print_iter], X[2*print_iter+1]); print_iter++; } printf("\n"); /* Release OpenCL memory objects. */ clReleaseMemObject( bufX ); free(X); /* Release the plan. */ err = clfftDestroyPlan( &planHandle ); /* Release clFFT library. */ clfftTeardown( ); /* Release OpenCL working objects. */ clReleaseCommandQueue( queue ); clReleaseContext( ctx ); return ret; }
int main(void) { //time meassuring struct timeval tvs; struct timeval tve; float elapsedTime; int Nx; int Ny; int Nz; int N; int plotnum=0; int Tmax=0; int plottime=0; int plotgap=0; float Lx,Ly,Lz; float dt=0.0; float A=0.0; float B=0.0; float Du=0.0; float Dv=0.0; float a[2]={1.0,0.0}; float b[2]={0.5,0.0}; float* x,*y,*z ; float* u[2],*v[2]; //openCL variables cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem cl_u[2] = {NULL,NULL}; cl_mem cl_v[2] = {NULL,NULL}; cl_mem cl_uhat[2] = {NULL,NULL}; cl_mem cl_vhat[2] = {NULL,NULL}; cl_mem cl_x = NULL; cl_mem cl_y = NULL; cl_mem cl_z = NULL; cl_mem cl_kx = NULL; cl_mem cl_ky = NULL; cl_mem cl_kz = NULL; cl_program p_grid = NULL,p_frequencies = NULL,p_initialdata = NULL,p_linearpart=NULL,p_nonlinearpart=NULL; cl_kernel grid = NULL,frequencies = NULL,initialdata = NULL,linearpart=NULL,nonlinearpart=NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices); context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); command_queue = clCreateCommandQueue(context, device_id, 0, &ret); size_t source_size; char *source_str; //end opencl int i,n; int status=0; //int start, finish, count_rate, ind, numthreads char nameconfig[100]=""; //Read infutfile char InputFileName[]="./INPUTFILE"; FILE*fp; fp=fopen(InputFileName,"r"); if(!fp) {fprintf(stderr, "Failed to load IPUTFILE.\n");exit(1);} int ierr=fscanf(fp, "%d %d %d %d %d %f %f %f %f %f %f %f %f", &Nx,&Ny,&Nz,&Tmax,&plotgap,&Lx,&Ly,&Lz,&dt,&Du,&Dv,&A,&B); if(ierr!=13){fprintf(stderr, "INPUTFILE corrupted.\n");exit(1);} fclose(fp); printf("NX %d\n",Nx); printf("NY %d\n",Ny); printf("NZ %d\n",Nz); printf("Tmax %d\n",Tmax); printf("plotgap %d\n",plotgap); printf("Lx %f\n",Lx); printf("Ly %f\n",Ly); printf("Lz %f\n",Lz); printf("dt 
%f\n",dt); printf("Du %f\n",Du); printf("Dv %f\n",Dv); printf("F %f\n",A); printf("k %f\n",B); printf("Read inputfile\n"); N=Nx*Ny*Nz; plottime=plotgap; B=A+B; //ALLocate the memory u[0]=(float*) malloc(N*sizeof(float)); v[0]=(float*) malloc(N*sizeof(float)); x=(float*) malloc(Nx*sizeof(float)); y=(float*) malloc(Ny*sizeof(float)); z=(float*) malloc(Nz*sizeof(float)); //allocate gpu mem cl_u[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_v[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_u[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_v[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_uhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_vhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_uhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_vhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); printf("allocated space\n"); // FFT library realted declarations. clfftPlanHandle planHandle; clfftDim dim = CLFFT_3D; size_t clLengths[3] = {Nx, Ny, Nz}; // Setup clFFT. clfftSetupData fftSetup; ret = clfftInitSetupData(&fftSetup); ret = clfftSetup(&fftSetup); // Create a default plan for a complex FFT. ret = clfftCreateDefaultPlan(&planHandle, context, dim, clLengths); // Set plan parameters. ret = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE); ret = clfftSetLayout(planHandle, CLFFT_COMPLEX_PLANAR, CLFFT_COMPLEX_PLANAR); ret = clfftSetResultLocation(planHandle, CLFFT_OUTOFPLACE); // Bake the plan. ret = clfftBakePlan(planHandle, 1, &command_queue, NULL, NULL); // Create temporary buffer. cl_mem tmpBufferu = 0; cl_mem tmpBufferv = 0; // Size of temp buffer. 
size_t tmpBufferSize = 0; status = clfftGetTmpBufSize(planHandle, &tmpBufferSize); if ((status == 0) && (tmpBufferSize > 0)) { tmpBufferu = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret); tmpBufferv = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret); if (ret != CL_SUCCESS) printf("Error with tmpBuffer clCreateBuffer\n"); } //kernel grid fp = fopen("./grid.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load grid.\n"); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_grid = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_grid, 1, &device_id, NULL, NULL, NULL); grid = clCreateKernel(p_grid, "grid", &ret); //first x cl_x = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_x); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lx); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nx); size_t global_work_size_x[3] = {Nx, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_x, CL_TRUE, 0, Nx * sizeof(float), x, 0, NULL, NULL); ret = clFinish(command_queue); //then y cl_y = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_y); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Ly); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Ny); size_t global_work_size_y[3] = {Ny, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_y, CL_TRUE, 0, Ny * sizeof(float), y, 0, NULL, NULL); ret = clFinish(command_queue); //last z cl_z = clCreateBuffer(context, 
CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_z); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lz); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nz); size_t global_work_size_z[3] = {Nz, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_z, CL_TRUE, 0, Nz * sizeof(float), z, 0, NULL, NULL); ret = clFinish(command_queue); ret = clReleaseKernel(grid); ret = clReleaseProgram(p_grid); //kernel initial data fp = fopen("./initialdata.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load initialdata.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_initialdata = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_initialdata, 1, &device_id, NULL, NULL, NULL); initialdata = clCreateKernel(p_initialdata, "initialdata", &ret); ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u[0]); ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v[0]); ret = clSetKernelArg(initialdata, 2, sizeof(cl_mem),(void *)&cl_u[1]); ret = clSetKernelArg(initialdata, 3, sizeof(cl_mem),(void* )&cl_v[1]); ret = clSetKernelArg(initialdata, 4, sizeof(cl_mem),(void* )&cl_x); ret = clSetKernelArg(initialdata, 5, sizeof(cl_mem),(void* )&cl_y); ret = clSetKernelArg(initialdata, 6, sizeof(cl_mem),(void* )&cl_z); ret = clSetKernelArg(initialdata, 7, sizeof(int),(void* )&Nx); ret = clSetKernelArg(initialdata, 8, sizeof(int),(void* )&Ny); ret = clSetKernelArg(initialdata, 9, sizeof(int),(void* )&Nz); size_t global_work_size[3] = {N, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = 
clReleaseKernel(initialdata); ret = clReleaseProgram(p_initialdata); ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL); ret = clFinish(command_queue); ret = clReleaseMemObject(cl_x); ret = clReleaseMemObject(cl_y); ret = clReleaseMemObject(cl_z); //write to disk fp=fopen("./data/xcoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write xcoord.dat.\n"); exit(1); } for(i=0;i<Nx;i++){fprintf(fp,"%f\n",x[i]);} fclose( fp ); fp=fopen("./data/ycoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write ycoord.dat.\n"); exit(1); } for(i=0;i<Ny;i++){fprintf(fp,"%f\n",y[i]);} fclose( fp ); fp=fopen("./data/zcoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write zcoord.dat.\n"); exit(1); } for(i=0;i<Nz;i++){fprintf(fp,"%f\n",z[i]);} fclose( fp ); free(x); free(y); free(z); n=0; plotnum=0; //output of initial data U char tmp_str[10]; strcpy(nameconfig,"./data/u"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);} fclose( fp ); //V strcpy(nameconfig,"./data/v"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);} fclose( fp ); //frequencies kernel fp = fopen("./frequencies.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load frequencies.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_frequencies = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t 
*)&source_size, &ret); ret = clBuildProgram(p_frequencies, 1, &device_id, NULL, NULL, NULL); frequencies = clCreateKernel(p_frequencies, "frequencies", &ret); //get frequencies first x cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kx); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lx); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nx); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //then y cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_ky); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Ly); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Ny); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //last z cl_kz = clCreateBuffer(context, CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kz); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lz); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nz); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL); ret = clFinish(command_queue); printf("Setup grid, fourier frequencies and initialcondition\n"); //load the rest of the kernels //linearpart kernel fp = fopen("./linearpart.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load linearpart.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_linearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_linearpart, 1, &device_id, NULL, NULL, 
NULL); linearpart = clCreateKernel(p_linearpart, "linearpart", &ret); //kernel nonlinear fp = fopen("./nonlinearpart.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load nonlinearpart.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_nonlinearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_nonlinearpart, 1, &device_id, NULL, NULL, NULL); nonlinearpart = clCreateKernel(p_nonlinearpart, "nonlinearpart", &ret); printf("Got initial data, starting timestepping\n"); gettimeofday(&tvs, NULL); for(n=0;n<=Tmax;n++){ //linear ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv); ret = clFinish(command_queue); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]); ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz); ret = clSetKernelArg(linearpart, 7, sizeof(float),(void* )&dt); ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du); ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv); ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A); ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B); ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]); ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]); ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx); ret = 
clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny); ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv); ret = clFinish(command_queue); //nonlinearpart ret = clSetKernelArg(nonlinearpart, 0, sizeof(cl_mem),(void *)&cl_u[0]); ret = clSetKernelArg(nonlinearpart, 1, sizeof(cl_mem),(void *)&cl_u[1]); ret = clSetKernelArg(nonlinearpart, 2, sizeof(cl_mem),(void* )&cl_v[0]); ret = clSetKernelArg(nonlinearpart, 3, sizeof(cl_mem),(void* )&cl_v[1]); ret = clSetKernelArg(nonlinearpart, 4, sizeof(float),(void* )&dt); ret = clSetKernelArg(nonlinearpart, 5, sizeof(float),(void* )&a[0]); ret = clSetKernelArg(nonlinearpart, 6, sizeof(float),(void* )&a[1]); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); // linear part ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv); ret = clFinish(command_queue); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]); ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz); ret = clSetKernelArg(linearpart, 7, 
sizeof(float),(void* )&dt); ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du); ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv); ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A); ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B); ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]); ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]); ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx); ret = clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny); ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv); ret = clFinish(command_queue); // done if(n==plottime){ printf("time:%f, step:%d,%d\n",n*dt,n,plotnum); plottime=plottime+plotgap; plotnum=plotnum+1; ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL); ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL); ret = clFinish(command_queue); //output of data U char tmp_str[10]; strcpy(nameconfig,"./data/u"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write u-data.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);} fclose( fp ); //V strcpy(nameconfig,"./data/v"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write v-data.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);} fclose( fp ); } } 
gettimeofday(&tve, NULL); printf("Finished time stepping\n"); elapsedTime = (tve.tv_sec - tvs.tv_sec) * 1000.0; // sec to ms elapsedTime += (tve.tv_usec - tvs.tv_usec) / 1000.0; // us to ms printf("%f,",elapsedTime); clReleaseMemObject(cl_u[0]); clReleaseMemObject(cl_u[1]); clReleaseMemObject(cl_v[0]); clReleaseMemObject(cl_v[1]); clReleaseMemObject(cl_uhat[0]); clReleaseMemObject(cl_uhat[1]); clReleaseMemObject(cl_vhat[0]); clReleaseMemObject(cl_vhat[1]); clReleaseMemObject(cl_kx); clReleaseMemObject(cl_ky); clReleaseMemObject(cl_kz); ret = clReleaseKernel(frequencies); ret = clReleaseProgram(p_frequencies); ret = clReleaseKernel(linearpart); ret = clReleaseProgram(p_linearpart); ret = clReleaseKernel(nonlinearpart); ret = clReleaseProgram(p_nonlinearpart); free(u[0]); free(v[0]); clReleaseMemObject(tmpBufferu); clReleaseMemObject(tmpBufferv); /* Release the plan. */ ret = clfftDestroyPlan(&planHandle); /* Release clFFT library. */ clfftTeardown(); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); printf("Program execution complete\n"); return 0; }
int main( void ) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX; float *X; cl_event event = NULL; int ret = 0; const size_t N0 = 4, N1 = 4, N2 = 4; char platform_name[128]; char device_name[128]; /* FFT library realted declarations */ clfftPlanHandle planHandle; clfftDim dim = CLFFT_3D; size_t clLengths[3] = {N0, N1, N2}; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); size_t ret_param_size = 0; err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, &ret_param_size); printf("Platform found: %s\n", platform_name); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL ); err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &ret_param_size); printf("Device found on the above platform: %s\n", device_name); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); queue = clCreateCommandQueue( ctx, device, 0, &err ); /* Setup clFFT. */ clfftSetupData fftSetup; err = clfftInitSetupData(&fftSetup); err = clfftSetup(&fftSetup); /* Allocate host & initialize data. */ /* Only allocation shown for simplicity. */ size_t buffer_size = N0 * N1 * N2 * 2 * sizeof(*X); X = (float *)malloc(buffer_size); /* print input array just using the * indices to fill the array with data */ printf("\nPerforming fft on an three dimensional array of size N0 x N1 x N2 : %ld x %ld x %ld\n", N0, N1, N2); int i, j, k; i = j = k = 0; for (i=0; i<N0; ++i) { for (j=0; j<N1; ++j) { for (k=0; k<N2; ++k) { float x = 0.0f; float y = 0.0f; if (i==0 && j==0 && k==0) { x = y = 0.5f; } unsigned idx = 2*(k+j*N1+i*N0*N1); X[idx] = x; X[idx+1] = y; printf("(%f, %f) ", X[idx], X[idx+1]); } printf("\n"); } printf("\n"); } /* Prepare OpenCL memory objects and place data inside them. 
*/ bufX = clCreateBuffer( ctx, CL_MEM_READ_WRITE, buffer_size, NULL, &err ); err = clEnqueueWriteBuffer( queue, bufX, CL_TRUE, 0, buffer_size, X, 0, NULL, NULL ); /* Create a default plan for a complex FFT. */ err = clfftCreateDefaultPlan(&planHandle, ctx, dim, clLengths); /* Set plan parameters. */ err = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE); err = clfftSetLayout(planHandle, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED); err = clfftSetResultLocation(planHandle, CLFFT_INPLACE); /* Bake the plan. */ err = clfftBakePlan(planHandle, 1, &queue, NULL, NULL); /* Execute the plan. */ err = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &queue, 0, NULL, NULL, &bufX, NULL, NULL); /* Wait for calculations to be finished. */ err = clFinish(queue); /* Fetch results of calculations. */ err = clEnqueueReadBuffer( queue, bufX, CL_TRUE, 0, buffer_size, X, 0, NULL, NULL ); /* print output array */ printf("\n\nfft result: \n"); i = j = k = 0; for (i=0; i<N0; ++i) { for (j=0; j<N1; ++j) { for (k=0; k<N2; ++k) { unsigned idx = 2*(k+j*N1+i*N0*N1); printf("(%f, %f) ", X[idx], X[idx+1]); } printf("\n"); } printf("\n"); } printf("\n"); /* Release OpenCL memory objects. */ clReleaseMemObject( bufX ); free(X); /* Release the plan. */ err = clfftDestroyPlan( &planHandle ); /* Release clFFT library. */ clfftTeardown( ); /* Release OpenCL working objects. */ clReleaseCommandQueue( queue ); clReleaseContext( ctx ); return ret; }