void ClFFT::perform( cl::Buffer real, cl::Buffer im, clFFT_Direction& dir ) { size_t requiredSize = m_size.getArea()*sizeof(float); size_t realBytes = real.getInfo<CL_MEM_SIZE>(); size_t imBytes = real.getInfo<CL_MEM_SIZE>(); assert(realBytes >= requiredSize); assert(imBytes >= requiredSize); try { cl_int err = clFFT_ExecutePlannar((*m_queue)(), *m_fftPlan, 1, dir, real(), im(), real(), im(), 0, NULL, NULL); if(!*m_fftPlan || err) throw cl::Error(ERR_OPENCL, "clFFT_ExecutePlannar"); } catchCLError; }
void CLSimulator::initializeClFFT() { /* x x x x * x x x x * x x x x * x x x x */ for (size_t x_idx = 0, x_val = _nX - 1; x_idx < _nX; ++x_idx, --x_val) { for (size_t y_idx = 0, y_val = _nY - 1; y_idx < _nY; ++y_idx, --y_val) { float distance = sqrt(pow(float(x_val), 2.0f) + pow(float(y_val), 2.0f)); _distances_real[x_idx + y_idx * _nFFTx] = _f_w_EE((float(distance))); } } /* v v x x * v v x x * x x x x * x x x x */ for (size_t x_idx = 0, x_val = _nX - 1; x_idx < _nX; ++x_idx, --x_val) { for (size_t y_idx = _nY, y_val = 1; y_idx < _nFFTy - 1; ++y_idx, ++y_val) { float distance = sqrt(pow(float(x_val), 2.0f) + pow(float(y_val), 2.0f)); _distances_real[x_idx + y_idx * _nFFTx] = _f_w_EE((float(distance))); } } /* v v v x * v v v x * x x x x * x x x x */ if (_nY > 1) { for (size_t x_idx = 0; x_idx < _nFFTx; ++x_idx) { _distances_real[x_idx + (_nFFTy - 1) * _nFFTx] = 0; } } /* v v v 0 * v v v 0 * x x x 0 * x x x 0 */ for (size_t x_idx = _nX, x_val = 1; x_idx < _nFFTx - 1; ++x_idx, ++x_val) { for (size_t y_idx = 0, y_val = _nY - 1; y_idx < _nY; ++y_idx, --y_val) { float distance = sqrt(pow(float(x_val), 2.0f) + pow(float(y_val), 2.0f)); _distances_real[x_idx + y_idx * _nFFTx] = _f_w_EE((float(distance))); } } /* v v v 0 * v v v 0 * v v x 0 * x x x 0 */ for (size_t y_idx = 0; y_idx < _nFFTy; ++y_idx) { _distances_real[(_nFFTx - 1) + y_idx * _nFFTx] = 0; } /* v v v 0 * v v v 0 * v v x 0 * 0 0 0 0 */ for (size_t x_idx = _nX, x_val = 1; x_idx < _nFFTx - 1; ++x_idx, ++x_val) { for (size_t y_idx = _nY, y_val = 1; y_idx < _nFFTy - 1; ++y_idx, ++y_val) { float distance = sqrt(pow(float(x_val), 2.0f) + pow(float(y_val), 2.0f)); _distances_real[x_idx + y_idx * _nFFTx] = _f_w_EE((float(distance))); } } /* v v v 0 * v v v 0 * v v v 0 * 0 0 0 0 */ assert(isPowerOfTwo(_nFFT)); assert(_nX >= 1 && _nY >= 1 && _nZ >= 1); assert((_nX >= _nY) && (_nY >= _nZ)); clFFT_Dim3 n = { static_cast<unsigned int>(_nFFTx), static_cast<unsigned int>(_nFFTy), static_cast<unsigned int>(_nFFTz) }; clFFT_DataFormat dataFormat = clFFT_SplitComplexFormat; clFFT_Dimension dim; if (_nY == 1) { dim = clFFT_1D; } else if (_nZ == 1) { dim = clFFT_2D; } else { dim = clFFT_3D; } _p_cl = clFFT_CreatePlan(_wrapper.getContextC(), n, dim, dataFormat, &_err); handleClError(_err); _distances_real_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _distances_real.get(), &_err); _distances_imag_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _sVals_real_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _sVals_imag_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _convolution_real_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _convolution_imag_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _distances_f_real_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _distances_f_imag_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _sVals_f_real_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _sVals_f_imag_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _convolution_f_real_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _convolution_f_imag_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _kernel_convolution = cl::Kernel(_program, "convolution", &_err); handleClError(_kernel_convolution.setArg(0, _convolution_f_real_cl)); handleClError(_kernel_convolution.setArg(1, _convolution_f_imag_cl)); handleClError(_kernel_convolution.setArg(2, _distances_f_real_cl)); handleClError(_kernel_convolution.setArg(3, _distances_f_imag_cl)); handleClError(_kernel_convolution.setArg(4, _sVals_f_real_cl)); handleClError(_kernel_convolution.setArg(5, _sVals_f_imag_cl)); handleClError(_kernel_convolution.setArg(6, _scaleFFT)); handleClError(clFFT_ExecutePlannar(_wrapper.getQueueC(), _p_cl, 1, clFFT_Forward, _distances_real_cl(), _distances_imag_cl(), _distances_f_real_cl(), _distances_f_imag_cl(), 0, NULL, NULL)); _wrapper.getQueue().finish(); }
void CLSimulator::f_I_FFT_clFFT(const Receptor rec) { // initialize sVals_real for FFT switch (rec) { case AMPA: handleClError(_kernel_prepareFFT_AMPA.setArg(0, _states_cl[_ind_old])); _err = _wrapper.getQueue().enqueueNDRangeKernel(_kernel_prepareFFT_AMPA, cl::NullRange, cl::NDRange(_numNeurons), cl::NullRange, NULL, NULL); break; case NMDA: handleClError(_kernel_prepareFFT_NMDA.setArg(0, _states_cl[_ind_old])); _err = _wrapper.getQueue().enqueueNDRangeKernel(_kernel_prepareFFT_NMDA, cl::NullRange, cl::NDRange(_numNeurons), cl::NullRange, NULL, NULL); break; case GABAA: handleClError(_kernel_prepareFFT_GABAA.setArg(0, _states_cl[_ind_old])); _err = _wrapper.getQueue().enqueueNDRangeKernel(_kernel_prepareFFT_GABAA, cl::NullRange, cl::NDRange(_numNeurons), cl::NullRange, NULL, NULL); break; } _wrapper.getQueue().finish(); // transform sVals into frequency domain using FFT handleClError(clFFT_ExecutePlannar(_wrapper.getQueueC(), _p_cl, 1, clFFT_Forward, _sVals_real_cl(), _sVals_imag_cl(), _sVals_f_real_cl(), _sVals_f_imag_cl(), 0, NULL, NULL)); _wrapper.getQueue().finish(); // execute convolution in frequency domain _err = _wrapper.getQueue().enqueueNDRangeKernel(_kernel_convolution, cl::NullRange, cl::NDRange(_nFFT), cl::NullRange, NULL, NULL); _wrapper.getQueue().finish(); // inverse transform convolution_f using FFT handleClError(clFFT_ExecutePlannar(_wrapper.getQueueC(), _p_cl, 1, clFFT_Inverse, _convolution_f_real_cl(), _convolution_f_imag_cl(), _convolution_real_cl(), _convolution_imag_cl(), 0, NULL, NULL)); _wrapper.getQueue().finish(); // update sumFootprint array for current receptor switch (rec) { case AMPA: _err = _wrapper.getQueue().enqueueNDRangeKernel(_kernel_postConvolution_AMPA, cl::NullRange, cl::NDRange(_numNeurons), cl::NullRange, NULL, NULL); break; case NMDA: _err = _wrapper.getQueue().enqueueNDRangeKernel(_kernel_postConvolution_NMDA, cl::NullRange, cl::NDRange(_numNeurons), cl::NullRange, NULL, NULL); break; case GABAA: _err = _wrapper.getQueue().enqueueNDRangeKernel(_kernel_postConvolution_GABAA, cl::NullRange, cl::NDRange(_numNeurons), cl::NullRange, NULL, NULL); break; } }
int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim, clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType) { cl_int err = CL_SUCCESS; int iter; double t; uint64_t t0, t1; int mx = log2(n.x); int my = log2(n.y); int mz = log2(n.z); int length = n.x * n.y * n.z * batchSize; double gflops = 5e-9 * ((double)mx + (double)my + (double)mz) * (double)n.x * (double)n.y * (double)n.z * (double)batchSize * (double)numIter; clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL }; clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL }; clFFT_Complex *data_i = NULL; clFFT_Complex *data_cl = NULL; clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL }; clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL }; clFFT_Plan plan = NULL; cl_mem data_in = NULL; cl_mem data_out = NULL; cl_mem data_in_real = NULL; cl_mem data_in_imag = NULL; cl_mem data_out_real = NULL; cl_mem data_out_imag = NULL; if(dataFormat == clFFT_SplitComplexFormat) { data_i_split.real = (float *) malloc(sizeof(float) * length); data_i_split.imag = (float *) malloc(sizeof(float) * length); data_cl_split.real = (float *) malloc(sizeof(float) * length); data_cl_split.imag = (float *) malloc(sizeof(float) * length); if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag) { err = -1; log_error("Out-of-Resources\n"); goto cleanup; } } else { data_i = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); if(!data_i || !data_cl) { err = -2; log_error("Out-of-Resouces\n"); goto cleanup; } } data_iref.real = (double *) malloc(sizeof(double) * length); data_iref.imag = (double *) malloc(sizeof(double) * length); data_oref.real = (double *) malloc(sizeof(double) * length); data_oref.imag = (double *) malloc(sizeof(double) * length); if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag) { err = -3; log_error("Out-of-Resources\n"); goto cleanup; } int i; if(dataFormat == clFFT_SplitComplexFormat) { for(i = 0; i < length; i++) { data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_cl_split.real[i] = 0.0f; data_cl_split.imag[i] = 0.0f; data_iref.real[i] = data_i_split.real[i]; data_iref.imag[i] = data_i_split.imag[i]; data_oref.real[i] = data_iref.real[i]; data_oref.imag[i] = data_iref.imag[i]; } } else { for(i = 0; i < length; i++) { data_i[i].real = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_i[i].imag = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_cl[i].real = 0.0f; data_cl[i].imag = 0.0f; data_iref.real[i] = data_i[i].real; data_iref.imag[i] = data_i[i].imag; data_oref.real[i] = data_iref.real[i]; data_oref.imag[i] = data_iref.imag[i]; } } plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err ); if(!plan || err) { log_error("clFFT_CreatePlan failed\n"); goto cleanup; } //clFFT_DumpPlan(plan, stdout); if(dataFormat == clFFT_SplitComplexFormat) { data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err); if(!data_in_real || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err); if(!data_in_imag || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } if(testType == clFFT_OUT_OF_PLACE) { data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err); if(!data_out_real || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err); if(!data_out_imag || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } } else { data_out_real = data_in_real; data_out_imag = data_in_imag; } } else { data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err); if(!data_in) { log_error("clCreateBuffer failed\n"); goto cleanup; } if(testType == clFFT_OUT_OF_PLACE) { data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err); if(!data_out) { log_error("clCreateBuffer failed\n"); goto cleanup; } } else data_out = data_in; } err = CL_SUCCESS; t0 = mach_absolute_time(); if(dataFormat == clFFT_SplitComplexFormat) { for(iter = 0; iter < numIter; iter++) err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL); } else { for(iter = 0; iter < numIter; iter++) err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL); } err |= clFinish(queue); if(err) { log_error("clFFT_Execute\n"); goto cleanup; } t1 = mach_absolute_time(); t = subtractTimes(t1, t0); char temp[100]; sprintf(temp, "GFlops achieved for n = (%d, %d, %d), batchsize = %d", n.x, n.y, n.z, batchSize); log_perf(gflops / (float) t, 1, "GFlops/s", "%s", temp); if(dataFormat == clFFT_SplitComplexFormat) { err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL); err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL); } else { err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL); } if(err) { log_error("clEnqueueReadBuffer failed\n"); goto cleanup; } computeReferenceD(&data_oref, n, batchSize, dim, dir); double diff_avg, diff_max, diff_min; if(dataFormat == clFFT_SplitComplexFormat) { diff_avg = computeL2Error(&data_cl_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); if(diff_avg > eps_avg) log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); else log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); } else { clFFT_SplitComplex result_split; result_split.real = (float *) malloc(length*sizeof(float)); result_split.imag = (float *) malloc(length*sizeof(float)); convertInterleavedToSplit(&result_split, data_cl, length); diff_avg = computeL2Error(&result_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); if(diff_avg > eps_avg) log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); else log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); free(result_split.real); free(result_split.imag); } cleanup: clFFT_DestroyPlan(plan); if(dataFormat == clFFT_SplitComplexFormat) { if(data_i_split.real) free(data_i_split.real); if(data_i_split.imag) free(data_i_split.imag); if(data_cl_split.real) free(data_cl_split.real); if(data_cl_split.imag) free(data_cl_split.imag); if(data_in_real) clReleaseMemObject(data_in_real); if(data_in_imag) clReleaseMemObject(data_in_imag); if(data_out_real && testType == clFFT_OUT_OF_PLACE) clReleaseMemObject(data_out_real); if(data_out_imag && clFFT_OUT_OF_PLACE) clReleaseMemObject(data_out_imag); } else { if(data_i) free(data_i); if(data_cl) free(data_cl); if(data_in) clReleaseMemObject(data_in); if(data_out && testType == clFFT_OUT_OF_PLACE) clReleaseMemObject(data_out); } if(data_iref.real) free(data_iref.real); if(data_iref.imag) free(data_iref.imag); if(data_oref.real) free(data_oref.real); if(data_oref.imag) free(data_oref.imag); return err; }
void Convolutioner_FrequencyDomain_OpenCL::process(AudioInOutBuffers<float_type>& audio ) { // unsigned int _2B = audio.channelLength_ * 2; unsigned int _B = audio.channelLength_; unsigned int _C = audio.numOfChannels_; //numOfChannels unsigned int _P = partitionedIR_.get_numOfPartsPerChannel(); //numOfIRPartsPerChannel //. //_ if >>>latency<<< or >>>number of channels<<< changed: // set partitionedIR // recreate buffers // recreate fft plans if ( window_.get_inputBlockSize() != audio.channelLength_ || window_.get_numOfChannels() != audio.numOfChannels_) { //Setting partitionedIR if (window_.get_inputBlockSize() != audio.channelLength_) { partitionedIR_.setNewIRF( irf_, audio.channelLength_ ); _P = partitionedIR_.get_numOfPartsPerChannel(); //Recreate, initialize buffers, and set as kernel arguments: PIR //recreate bufferPIR_R_.recreate(CL_MEM_READ_ONLY, _2B * _C * _P); bufferPIR_I_.recreate(CL_MEM_READ_ONLY, _2B * _C * _P); //. //initialize bufferPIR_R_.set(partitionedIR_.real_ ); bufferPIR_I_.set(partitionedIR_.imaginary_); //. //set as kernel argument bufferPIR_R_.setAsKernelArgument(0, complexMultiplyAdd_kernel_); bufferPIR_I_.setAsKernelArgument(1, complexMultiplyAdd_kernel_); //. //.(Recreate...) } //. //Recreate initialize buffers, and set as kernel arguments: transform, FDL, accumulator //recreate /****/bufferTransform_R_.recreate(CL_MEM_READ_WRITE, _2B * _C ); /****/bufferTransform_I_.recreate(CL_MEM_READ_WRITE, _2B * _C ); /**********/bufferFDL_R_.recreate(CL_MEM_READ_WRITE, _2B * _C * _P ); /**********/bufferFDL_I_.recreate(CL_MEM_READ_WRITE, _2B * _C * _P ); /**/bufferAccumulator_R_.recreate(CL_MEM_READ_WRITE, _2B * _C ); /**/bufferAccumulator_I_.recreate(CL_MEM_READ_WRITE, _2B * _C ); cpu_bufferAccumulator_R_ = new float_type[_2B * _C ]; cpu_bufferAccumulator_I_ = new float_type[_2B * _C ]; //. //initialize FDL with 0 bufferFDL_R_.fillWithZero(); bufferFDL_I_.fillWithZero(); lastInsertedDelayLineIdx = 0; //. //set as kernel argument /**********/bufferFDL_R_.setAsKernelArgument(2, complexMultiplyAdd_kernel_); /**********/bufferFDL_I_.setAsKernelArgument(3, complexMultiplyAdd_kernel_); /**/bufferAccumulator_R_.setAsKernelArgument(4, complexMultiplyAdd_kernel_); /**/bufferAccumulator_I_.setAsKernelArgument(5, complexMultiplyAdd_kernel_); //. //.(Recreate...) //Recreate plans clFFT_Dim3 dim; dim.x = _2B; dim.y = 1; dim.z = 1; fftPlan_ = clFFT_CreatePlan(context_, dim, clFFT_1D, clFFT_SplitComplexFormat, &lastCommandStatus_); //. } //update each time bufferGlobalParameters because of incrementing of lastInsertedDelayLineIdx /*(_2B, _C, _P, pir_C, FDL_LINE)*/ cpuData_bufferGlobalParameters_[0] = _2B; cpuData_bufferGlobalParameters_[1] = _C; cpuData_bufferGlobalParameters_[2] = _P; cpuData_bufferGlobalParameters_[3] = irf_->numOfChannels_; cpuData_bufferGlobalParameters_[4] = lastInsertedDelayLineIdx; bufferGlobalParameters_.set(cpuData_bufferGlobalParameters_); //. //Update channelsWindow window_.update( audio, /*history size*/ _B ); //. //Init >>bufferTransform<< bufferTransform_R_.set(window_.buffer_.data_); for(unsigned int i = 0; i < _2B * _C; ++i) cpu_bufferAccumulator_I_[i]=0; bufferTransform_I_.set(cpu_bufferAccumulator_I_); //. //Make fft of bufferTransform lastCommandStatus_ = clFFT_ExecutePlannar( cmdQueue_, fftPlan_, _C, clFFT_Forward, bufferTransform_R_, bufferTransform_I_, bufferTransform_R_, bufferTransform_I_, 0, NULL, NULL ); //. //Copy bufferTransform into bufferFDL (inserting new delay line) (real and imaginary part) clEnqueueCopyBuffer( cmdQueue_, bufferTransform_R_, bufferFDL_R_, 0, lastInsertedDelayLineIdx * (_2B * _C ) * sizeof(float_type), (_2B * _C ) * sizeof(float_type), 0, NULL, NULL); clEnqueueCopyBuffer( cmdQueue_, bufferTransform_I_, bufferFDL_I_, 0, lastInsertedDelayLineIdx * (_2B * _C ) * sizeof(float_type), (_2B * _C ) * sizeof(float_type), 0, NULL, NULL); //. //Increment host lastInsertedDelayLine lastInsertedDelayLineIdx = (lastInsertedDelayLineIdx + 1 ) % _P; //. //Execute kernel size_t globalWorkSize[1]; globalWorkSize[0] = _2B * _C /* == window_.get_allLength() */; lastCommandStatus_ = clEnqueueNDRangeKernel(cmdQueue_, complexMultiplyAdd_kernel_, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); if(lastCommandStatus_ == -4) { std::cout << "Too much amount of memory must be allocated on the GPU due to lenght of impulse response and number of channels."; throw int(); } else if(lastCommandStatus_ != 0) { std::cout << "Error while sending clEnqueueNDRangeKernel."; throw int(); } //. //ifft of bufferAccumulator lastCommandStatus_ = clFFT_ExecutePlannar( cmdQueue_, fftPlan_, _C, clFFT_Inverse, bufferAccumulator_R_, bufferAccumulator_I_, bufferAccumulator_R_, bufferAccumulator_I_, 0, NULL, NULL ); //. //Copy from bufferAccumulator to cpu bufferAccumulator_R_.get(cpu_bufferAccumulator_R_); //. //Flushing and finishing clFlush(cmdQueue_); clFinish(cmdQueue_); //. //Write fftw vector form to audio.outputChannel[number of Channel] for (unsigned int channNum = 0; channNum < _C; ++channNum) for (unsigned sampleNum = 0; sampleNum < _B; ++sampleNum) audio.out_[channNum][sampleNum] = (cpu_bufferAccumulator_R_[channNum*_2B + _B + sampleNum])/_2B; //. }