/// Builds the buffer-manipulation kernels for this device.
hardware::code::Buffer::Buffer(const hardware::code::OpenClKernelParametersInterface& kernelParameters, const hardware::Device* device)
    : Opencl_Module(kernelParameters, device)
{
    // The raw byte copy is self-contained and needs no shared sources.
    _copy_16_bytes = createKernel("copy_16_bytes") << "buffer.cl";

    // The clear kernels build on the gaugefield module's common sources.
    auto gaugefieldSources = get_device()->getGaugefieldCode()->get_sources();
    _clear_bytes = createKernel("clear_bytes") << gaugefieldSources << "buffer.cl";
    _clear_float4 = createKernel("clear_float4") << gaugefieldSources << "buffer.cl";
}
/// Compiles the pyramid program and fetches its two kernels.
/// Returns 1 on success, 0 on failure (kernel handles stay cleared).
int oclBilinearPyramid::compile()
{
    // Invalidate handles first so a failed build leaves nothing stale.
    clUpsample = 0;
    clDownsample = 0;

    if (!oclProgram::compile())
    {
        return 0;
    }

    clUpsample = createKernel("clUpsample");
    KERNEL_VALIDATE(clUpsample)

    clDownsample = createKernel("clDownsample");
    KERNEL_VALIDATE(clDownsample)

    return 1;
}
/// Prepares the FFT deconvolution run: builds the kernel from the second
/// source document's image and initializes the Ipp state.
/// Returns FALSE when initialization reports an Ipp error status.
BOOL CRunDeconvFFT::BeforeCall()
{
    pKernel = createKernel(m_pDocSrc2->GetImage());

    const IppStatus status = CallInit();
    IppErrorMessage(m_initName, status);

    // Negative Ipp status codes are errors; non-negative are OK/warnings.
    return (status < 0) ? FALSE : TRUE;
}
int oclBvhTrimesh::compile() { clAABB = 0; clMorton = 0; clCreateNodes = 0; clLinkNodes = 0; clCreateLeaves = 0; clComputeAABBs = 0; if (!mRadixSort.compile()) { return 0; } if (!oclProgram::compile()) { return 0; } clAABB = createKernel("clAABB"); KERNEL_VALIDATE(clAABB) clMorton = createKernel("clMorton"); KERNEL_VALIDATE(clMorton) clCreateNodes = createKernel("clCreateNodes"); KERNEL_VALIDATE(clCreateNodes) clLinkNodes = createKernel("clLinkNodes"); KERNEL_VALIDATE(clLinkNodes) clCreateLeaves = createKernel("clCreateLeaves"); KERNEL_VALIDATE(clCreateLeaves) clComputeAABBs = createKernel("clComputeAABBs"); KERNEL_VALIDATE(clComputeAABBs) return 1; }
void AudioDSPKernelProcessor::initialize() { if (isInitialized()) return; ASSERT(!m_kernels.size()); // Create processing kernels, one per channel. for (unsigned i = 0; i < numberOfChannels(); ++i) m_kernels.append(createKernel()); m_initialized = true; m_hasJustReset = true; }
/// Compiles the convolution program and fetches the isotropic and
/// anisotropic 2D kernels. Returns 1 on success, 0 on failure.
int oclConvolute::compile()
{
    // Invalidate handles first so a failed build leaves nothing stale.
    clIso2D = 0;
    clIso2Dsep = 0;
    clAniso2Dtang = 0;
    clAniso2Dorth = 0;

    if (!oclProgram::compile())
    {
        return 0;
    }

    clIso2D = createKernel("clIso2D");
    KERNEL_VALIDATE(clIso2D)
    clIso2Dsep = createKernel("clIso2Dsep");
    KERNEL_VALIDATE(clIso2Dsep)
    clAniso2Dtang = createKernel("clAniso2Dtang");
    KERNEL_VALIDATE(clAniso2Dtang)
    clAniso2Dorth = createKernel("clAniso2Dorth");
    KERNEL_VALIDATE(clAniso2Dorth)

    return 1;
}
void hardware::code::Real::fill_kernels() { basic_real_code = get_fundamental_sources() << "types.hpp" << "operations_real.cl"; logger.debug() << "Creating Real kernels..."; // Setting operations kernel get_elem_vec = createKernel("get_elem_vector") << basic_real_code << "real_access_vector_element.cl"; set_elem_vec = createKernel("set_elem_vector") << basic_real_code << "real_access_vector_element.cl"; // Single operations kernels ratio = createKernel("real_ratio") << basic_real_code << "real_ratio.cl"; product = createKernel("real_product") << basic_real_code << "real_product.cl"; sum = createKernel("real_sum") << basic_real_code << "real_sum.cl"; difference = createKernel("real_subtraction") << basic_real_code << "real_subtraction.cl"; // Update cgm kernels update_alpha_cgm = createKernel("update_alpha_cgm") << basic_real_code << "update_alpha_cgm.cl"; update_beta_cgm = createKernel("update_beta_cgm") << basic_real_code << "update_beta_cgm.cl"; update_zeta_cgm = createKernel("update_zeta_cgm") << basic_real_code << "update_zeta_cgm.cl"; }
// Entry point: sets up GLUT and OpenCL, loads the input bitmap into a device
// buffer, binds the swirl kernel arguments and enters the render loop.
int main(int argc, char **argv) {
    setbuf(stdout, NULL); // unbuffered stdout so prints appear immediately

    // GLUT window and callback setup; display() runs both idle and redraw.
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
    glutInitWindowSize(DIM, DIM);
    glutCreateWindow("Simple OpenGL OpenCL");
    glutIdleFunc(display);
    glutDisplayFunc(display);
    glutKeyboardFunc(keyboard);

    // OpenCL context, then the kernel from swirl.cl.
    initCL();
    createKernel("swirl.cl","swirlKernelSCB");

    // Query and print a few device limits.
    // NOTE(review): globalSize is declared but never used. Also
    // CL_DEVICE_MAX_COMPUTE_UNITS is documented as cl_uint; passing
    // sizeof(size_t) here relies on the driver accepting an oversized
    // destination — confirm on the target platform.
    size_t addressbits,localSize,computeUnits,globalSize;
    openCLErrorID = clGetDeviceInfo(deviceHandle, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &computeUnits, NULL);
    openCLErrorID = clGetDeviceInfo(deviceHandle, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &localSize, NULL);
    openCLErrorID = clGetDeviceInfo(deviceHandle, CL_DEVICE_ADDRESS_BITS, sizeof(size_t), &addressbits, NULL);
    printf("CL_DEVICE_MAX_COMPUTE_UNITS: %lu\nCL_DEVICE_MAX_WORK_GROUP_SIZE: %lu\nCL_DEVICE_ADDRESS_BITS: %lu\n",computeUnits,localSize,addressbits);

    // Load the bitmap; only the red channel is used, normalized to [0,1].
    Bitmap bmp = Bitmap("who-is-that.bmp");
    if (bmp.isValid()) {
        for (int i = 0 ; i < DIM*DIM ; i++) {
            sourceColors[i] = bmp.getR(i/DIM, i%DIM) / 255.0f;
        }
    }else{
        printf("couldnt load who-is-that.bmp"); // NOTE(review): message typo ("couldnt")
        exit(0);
    }

    // DONE: allocate memory at sourceDevPtr on the GPU and copy sourceColors into it.
    sourceDevPtr = clCreateBuffer( contextHandle, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, DIM*DIM*sizeof(float), sourceColors, &openCLErrorID);

    // DONE: allocate memory at swirlDevPtr for the unswirled image.
    swirlDevPtr = clCreateBuffer( contextHandle, CL_MEM_READ_WRITE, DIM*DIM*sizeof(float), NULL, &openCLErrorID);

    // DONE: Set Kernel Arguments — src buffer, dst buffer, swirl coefficients a and b.
    openCLErrorID = clSetKernelArg(kernel,0,sizeof(cl_mem),&sourceDevPtr);
    openCLErrorID = clSetKernelArg(kernel,1,sizeof(cl_mem),&swirlDevPtr);
    openCLErrorID = clSetKernelArg(kernel,2,sizeof(cl_float),&a);
    openCLErrorID = clSetKernelArg(kernel,3,sizeof(cl_float),&b);

    glutMainLoop();

    // NOTE(review): classic glutMainLoop() never returns, so this cleanup is
    // unlikely to execute — confirm resources are released elsewhere.
    cleanup();
}
// ------------------------------------------------------------------------- void MorphOpenCL::recompile(Morphology::EOperationType opType, int coordsSize) { static int prevCoordsSize[Morphology::OT_Gradient+1] = {0}; SKernelParameters* kparams; cl::Kernel* kernel; if(opType == Morphology::OT_Erode) { kparams = &erodeParams; kernel = &kernelErode; } else if(opType == Morphology::OT_Dilate) { kparams = &dilateParams; kernel = &kernelDilate; } else if(opType == Morphology::OT_Gradient) { kparams = &gradientParams; kernel = &kernelGradient; } else { if(opType == Morphology::OT_TopHat || opType == Morphology::OT_BlackHat || opType == Morphology::OT_Open || opType == Morphology::OT_Close) { recompile(Morphology::OT_Erode, coordsSize); recompile(Morphology::OT_Dilate, coordsSize); } return; } if(!kparams->needRecompile || coordsSize == prevCoordsSize[opType]) return; QString opts = kparams->options + " -DCOORDS_SIZE=" + QString::number(coordsSize); prevCoordsSize[opType] = coordsSize; cl::Program prog = createProgram(kparams->programName,opts); *kernel = createKernel(prog, kparams->kernelName); }
/** Updates the internal state of the filter: reallocates the kernel tap
    array when the configured kernel length has changed, then recomputes
    the taps. Always returns 0. */
int AnalogFilter::updateInternal() {
    /* Disabled length recomputation (kept for reference):
    int oldKernelLength = nKernelLength;
    nKernelLength = (int) dKernelLength;
    if (nKernelLength < 1) {
        nKernelLength = 1;
        dKernelLength = 1;
    }
    */
    // Reallocate memory for kernel.
    // NOTE(review): with the block above commented out, oldKernelLength must
    // be a member variable tracking the last allocated size — confirm; if it
    // was meant to be the disabled local, nKernelLength is never refreshed
    // from dKernelLength here.
    if (oldKernelLength != nKernelLength) {
        delete[] filter_kernel;
        filter_kernel = new double[nKernelLength];
        createKernel(nKernelLength); // fill the freshly allocated taps
        oldKernelLength = nKernelLength;
    }
    return 0;
}
/// Convenience overload: forwards a std::string kernel name to the
/// const char* implementation.
Kernel Program::createKernel(const string& name) const
{
    const char* rawName = name.c_str();
    return createKernel(rawName);
}
int main(int argc, char **argv) { //single precision real number //row major m rows by n columns int performance_level = atoi(argv[1]); int m = atoi(argv[2]);//m and n should be mod 32 int n = atoi(argv[3]); int batchSize = atoi(argv[4]); //n should be twice as m for now. if (n != 3 * m) { std::cout << "n should be three times as m for now." << std::endl; return 1; } //malloc input data std::complex<float> *CPU_A = (std::complex<float>*)malloc(m*n*batchSize*sizeof(std::complex<float>)); //temperay buffer to hold the intermediate result after the first kernel std::complex<float> *CPU_A_TEMP = (std::complex<float>*)malloc(m*n*batchSize*sizeof(std::complex<float>)); std::complex<float> *CPU_A_OUT = (std::complex<float>*)malloc(m*n*batchSize*sizeof(std::complex<float>)); int miniBatchSize = n / m;//which is 3 for now for (int k = 0; k < batchSize; k++) { for (int q = 0; q < miniBatchSize; q++) { for (int i = 0; i < m; i++) { for (int j = 0; j < n/3; j++) { CPU_A[k*m*n + q*n/3 + i*n + j] = { (float)(i*n + j + k + q), (float)(i*n + j + k + 2*q) }; } } } } //init OpenCL cl_int err; cl_platform_id platform; cl_device_id device; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context context; cl_command_queue queue; cl_kernel kernel1, kernel2; cl_event event1, event2; char *source1, *source2; platform = getPlatform(PLATFORM_NAME); assert(platform != NULL); device = getDevice(platform, DEVICE_NAME); assert(device != NULL); props[1] = (cl_context_properties)platform; context = clCreateContext(props, 1, &device, NULL, NULL, &err); assert(context != NULL); queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); assert(queue != NULL); cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_WRITE, (m * n * batchSize) * sizeof(*CPU_A), NULL, &err); assert(bufA != NULL); //move memory from host to device err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, (m*n*batchSize) * sizeof(*CPU_A), CPU_A, 0, NULL, NULL); //compile kernel source1 = 
loadFile(KERNEL_SOURCE1); assert(source1 != NULL); kernel1 = createKernel(source1, context, BUILD_OPTIONS, &err); assert(kernel1 != NULL); source2 = loadFile(KERNEL_SOURCE2); assert(source2 != NULL); kernel2 = createKernel(source2, context, BUILD_OPTIONS, &err); assert(kernel2 != NULL); //launch kernel size_t localWorkSize2[1] = { 256 }; //calculate number of work groups //each work group works on a 32 by 32 block //the whole matrix has ((m-1)/32+1) * ((n-1)/32+1) = 23 x 23 blocks //the upper triangle of which (including the diagional) is //23*(23+1)/2 = 276 //so the formula is ((m-1)/32+1) * ((n/3-1)/32+1+1) / 2 int num_wg = ((m - 1) / 32 + 1) * ((m - 1) / 32 + 1 + 1) / 2; size_t globalWorkSize2[1] = { batchSize * num_wg * miniBatchSize * 256 }; err = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &bufA); assert(err == CL_SUCCESS); /* err = clSetKernelArg(kernel, 1, sizeof(cl_uint), &m); assert(err == CL_SUCCESS); err = clSetKernelArg(kernel, 2, sizeof(cl_uint), &num_wg); assert(err == CL_SUCCESS); */ //second pass kernel sizes size_t localWorkSize1[1] = { 256 }; size_t globalWorkSize1[1] = { batchSize*(313)*256 }; // 313 is calculated by the permutation algorithm given input 3 and 729 err = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &bufA); assert(err == CL_SUCCESS); if (performance_level == 0) { //check result //first launch kernel 1 that swaps lines err = clEnqueueNDRangeKernel(queue, kernel1, 1, NULL, globalWorkSize1, localWorkSize1, 0, NULL, &event1); assert(err == CL_SUCCESS); err = clFinish(queue); assert(err == CL_SUCCESS); err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (batchSize*m*n) * sizeof(*CPU_A_TEMP), CPU_A_TEMP, 0, NULL, NULL); assert(err == CL_SUCCESS); //second pass that transpose each minibatch err = clEnqueueNDRangeKernel(queue, kernel2, 1, NULL, globalWorkSize2, localWorkSize2, 0, NULL, &event2); assert(err == CL_SUCCESS); err = clFinish(queue); } else if (performance_level == 1) { //check kernel performance //second pass err = 
clEnqueueNDRangeKernel(queue, kernel1, 1, NULL, globalWorkSize1, localWorkSize1, 0, NULL, &event1); assert(err == CL_SUCCESS); clWaitForEvents(1, &event1); assert(err == CL_SUCCESS); err = clEnqueueNDRangeKernel(queue, kernel2, 1, NULL, globalWorkSize2, localWorkSize2, 0, NULL, &event2); assert(err == CL_SUCCESS); clWaitForEvents(1, &event2); assert(err == CL_SUCCESS); cl_ulong start1, end1, start2, end2; cl_ulong KernelTime1 = 0; cl_ulong KernelTime2 = 0; int iteration = 20; for (int i = 0; i < iteration; i++) { event1 = NULL; event2 = NULL; err = clEnqueueNDRangeKernel(queue, kernel1, 1, NULL, globalWorkSize1, localWorkSize1, 0, NULL, &event1); assert(err == CL_SUCCESS); clWaitForEvents(1, &event1); assert(err == CL_SUCCESS); err = clEnqueueNDRangeKernel(queue, kernel2, 1, NULL, globalWorkSize2, localWorkSize2, 0, NULL, &event2); assert(err == CL_SUCCESS); clWaitForEvents(1, &event2); assert(err == CL_SUCCESS); err = clGetEventProfilingInfo(event1, CL_PROFILING_COMMAND_START, sizeof(start1), &start1, NULL); err = clGetEventProfilingInfo(event1, CL_PROFILING_COMMAND_END, sizeof(end1), &end1, NULL); err = clGetEventProfilingInfo(event2, CL_PROFILING_COMMAND_START, sizeof(start2), &start2, NULL); err = clGetEventProfilingInfo(event2, CL_PROFILING_COMMAND_END, sizeof(end2), &end2, NULL); KernelTime1 += (end1 - start1); KernelTime2 += (end2 - start2); } //KernelTime is in ns size_t peakGBs = 512; std::cout << "the first kernel takes " << KernelTime1/iteration << " ns in average." << std::endl; std::cout << "the second kernel takes " << KernelTime2/iteration << " ns in average." 
<< std::endl; size_t KernelGBs = 2 * sizeof(std::complex<float>) * m * n * batchSize / ((KernelTime1 + KernelTime2) / iteration); std::cout << " GBs: " << KernelGBs << " GBs" << std::endl; float efficiency = ((float)KernelGBs) / (float)peakGBs; std::cout << " efficiency: " << efficiency * 100 << "%" << std::endl; } //move memory from device to host err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (batchSize*m*n) * sizeof(*CPU_A_OUT), CPU_A_OUT, 0, NULL, NULL); assert(err == CL_SUCCESS); if (performance_level == 0) { //check result int error = 0; for (int k = 0; k < batchSize; k++) { for (int q = 0; q < miniBatchSize; q++) { for (int i = 0; i < m; i++) { for (int j = 0; j < n / 3; j++) { //std::complex<float> out = CPU_A_TEMP[k*m*n + q*n/3 + i*n + j]; std::complex<float> out = CPU_A_TEMP[k*m*n + q*m*n/3 + i*n/3 +j]; std::complex<float> in = CPU_A[k*m*n + q*n / 3 + i*n + j]; if (in != out) { error = 1; break; } } } } } if (error == 0) { std::cout << "first kernel correstness passed." << std::endl; } else { std::cout << "first kernel correctness failed." << std::endl; } for (int k = 0; k < batchSize; k++) { for (int q = 0; q < miniBatchSize; q++) { for (int i = 0; i < m; i++) { for (int j = 0; j < n / 3; j++) { std::complex<float> out = CPU_A_OUT[k*m*n + q*m*n/3 + j*m + i]; std::complex<float> in = CPU_A[k*m*n + q*n/3 + i*n + j]; if (in != out) { error = 1; break; } } } } } if (error == 0) { std::cout << "correstness passed." << std::endl; } else { std::cout << "correctness failed." << std::endl; } } //releasing the objects err = clReleaseMemObject(bufA); err = clReleaseEvent(event1); err = clReleaseEvent(event2); err = clReleaseKernel(kernel1); err = clReleaseKernel(kernel2); err = clReleaseCommandQueue(queue); err = clReleaseContext(context); free(CPU_A_TEMP); free(CPU_A); free(CPU_A_OUT); }
/** Filter a response signal of the neural microcircuit.
    Convolves each channel's recent history with filter_kernel and replaces
    the newest queued sample with the filtered value.
    \param R Response of the neural microcircuit (one value per channel).
    \param X Target vector where to save the results.
    \param indices Indices where to store the results in X
                   (if NULL, channel i writes to X[i]).
    \return -1 if an error occurred, 1 for success.
*/
int AnalogFilter::filter(const double* R, double* X, int* indices) {
    if (R == 0) {
        TheCsimError.add("AnalogFilter::filter: Input is a NULL pointer!\n");
        return -1;
    }
    if (X == 0) {
        TheCsimError.add("AnalogFilter::filter: Target vector is a NULL pointer!\n");
        return -1;
    }
    deque<double>::iterator p;
    double f_value;
    int i, j;

    nInputAvailable++;
    if ((nInputAvailable) <= nKernelLength) {
        // Length of collected input data is shorter than the desired size of
        // the filter kernel: calculate a new, shorter kernel.
        createKernel(nInputAvailable);
    }

    // Put the new data into the queues.
    for (i=0; i<nChannels; i++) {
        // Once the queue has reached kernel length, drop the oldest element.
        if (nInputAvailable > nKernelLength)
            dataQueues[i]->pop_front();
        // Add the new value at the end of the queue.
        dataQueues[i]->push_back(R[i]);
    }

    int nToFilter = min(nInputAvailable, nKernelLength);

    // Filter all analog channels.
    for (i=0; i<nChannels; i++) {
        p = dataQueues[i]->begin();
        f_value = 0.0;
        for (j=0; j<nToFilter; j++) {
            // Calculate filtered value.
            f_value += filter_kernel[j] * *p;
            if (p != dataQueues[i]->end()) {
                if (j < (nToFilter - 1))
                    // Do not advance the iterator for the last object, since
                    // we want to change its content below.
                    p++;
            }
            else {
                // Queue ran out before nToFilter samples were consumed.
                TheCsimError.add("AnalogFilter::filter: Data was lost before filtering!\n");
                return -1;
            }
        }
        // Store the filtered value: replace the value of the last input.
        *p = f_value;
        if (indices)
            X[indices[i]] = f_value;
        else
            X[i] = f_value;
    }
    return 1;
}
// Runs the Bluestein FFT step on every OpenCL device: compiles the program,
// splits `size` elements across the devices, launches the kernel on each,
// then gathers the H result arrays back into the host R buffers.
//   n     - padded transform length (powM = log2(n))
//   orign - original (unpadded) transform length
//   size  - total number of elements to process across all devices
void bluesteinsFFTGpu(const char* const argv[],const unsigned n, const unsigned orign,const unsigned size)
{
    const unsigned powM = (unsigned) log2(n);
    printf("Compiling Bluesteins Program..\n");
    compileProgram(argv, "fft.h", "kernels/bluesteins.cl");
    printf("Creating Kernel\n");
    for (unsigned i = 0; i < deviceCount; ++i) {
        createKernel(i, "bluesteins");
    }

    // Even split; the last device absorbs the remainder.
    const unsigned sizePerGPU = size / deviceCount;
    for (unsigned i = 0; i < deviceCount; ++i) {
        workSize[i] = (i != (deviceCount - 1)) ? sizePerGPU : (size - workOffset[i]);
        allocateDeviceMemoryBS(i , workSize[i], workOffset[i]);
        // Bind per-device buffers and scalar parameters.
        clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void*) &d_Hreal[i]);
        clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void*) &d_Himag[i]);
        clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void*) &d_Yreal[i]);
        clSetKernelArg(kernel[i], 3, sizeof(cl_mem), (void*) &d_Yimag[i]);
        clSetKernelArg(kernel[i], 4, sizeof(cl_mem), (void*) &d_Zreal[i]);
        clSetKernelArg(kernel[i], 5, sizeof(cl_mem), (void*) &d_Zimag[i]);
        clSetKernelArg(kernel[i], 6, sizeof(unsigned), &n);
        clSetKernelArg(kernel[i], 7, sizeof(unsigned), &orign);
        clSetKernelArg(kernel[i], 8, sizeof(unsigned), &powM);
        clSetKernelArg(kernel[i], 9, sizeof(unsigned), &blockSize);
        // The next device's chunk starts right after this one's.
        if ((i + 1) < deviceCount) {
            workOffset[i + 1] = workOffset[i] + workSize[i];
        }
    }

    size_t localWorkSize[] = {blockSize};
    for (unsigned i = 0; i < deviceCount; ++i) {
        size_t globalWorkSize[] = {shrRoundUp(blockSize, workSize[i])};
        // kernel non blocking execution
        runKernel(i, localWorkSize, globalWorkSize);
    }

    // Results come back in the H arrays; alias them as the R (result) arrays.
    h_Rreal = h_Hreal;
    h_Rimag = h_Himag;
    for (unsigned i = 0; i < deviceCount; ++i) {
        copyFromDevice(i, d_Hreal[i], h_Rreal + workOffset[i], workSize[i]);
        copyFromDevice(i, d_Himag[i], h_Rimag + workOffset[i], workSize[i]);
    }

    // wait for copy event
    const cl_int ciErrNum = clWaitForEvents(deviceCount, gpuDone);
    checkError(ciErrNum, CL_SUCCESS, "clWaitForEvents");
    printGpuTime();
}
// ------------------------------------------------------------------------- cl::Kernel MorphOpenCL::createKernel(const cl::Program& prog, const QString& kernelName) { std::string b = kernelName.toStdString(); return createKernel(prog, b.c_str()); }
// Builds every kernel used by the library, assigning each a fixed slot index.
// Two kernels have device-specific sources: the GPU (E_*) variant when
// device_use is set, otherwise the CPU variant.
theKernels(cl_context GPUContext, cl_device_id cdDevice)
{
    GPUContext_K = GPUContext;
    cdDevice_K = cdDevice;

    // Slot 0: pairwise distance (device-specific source).
    createKernel("pairwiseDistanceKernel",
                 device_use ? "../../../src/E_PairwiseDistance.cl"
                            : "../../../src/CPU_PairwiseDistance.cl",
                 0);

    // Slots 1-4: reduction kernels.
    createKernel("argminKernel","../../../src/argminKernel.cl",1);
    createKernel("argmaxKernel","../../../src/argmaxKernel.cl",2);
    createKernel("minKernel","../../../src/minKernel.cl",3);
    createKernel("maxKernel","../../../src/maxKernel.cl",4);

    // Slot 5: blockwise distance (device-specific source).
    createKernel("blockwise_distance_kernel",
                 device_use ? "../../../src/E_blockwise_distance_kernel.cl"
                            : "../../../src/CPU_blockwise_distance_kernel.cl",
                 5);

    // Slots 6-10: filter and histogram kernels.
    createKernel("blockwise_filter_kernel","../../../src/blockwise_filter_kernel.cl",6);
    createKernel("cell_histogram_kernel","../../../src/cell_histogram_kernel.cl",7);
    createKernel("cellHistogramKernel1","../../../src/cellHistogramKernel1.cl",8);
    createKernel("cellHistogramKernel2","../../../src/cellHistogramKernel2.cl",9);
    createKernel("cellHistogramKernel3","../../../src/cellHistogramKernel3.cl",10);
}
int oclFluid3D::compile() { clInitFluid = 0; clIntegrateForce = 0; clIntegrateVelocity = 0; clHash = 0; clReorder = 0; clInitBounds = 0; if (!mRadixSort.compile()) { return 0; } if (!oclProgram::compile()) { return 0; } clInitFluid = createKernel("clInitFluid"); KERNEL_VALIDATE(clInitFluid) clIntegrateForce = createKernel("clIntegrateForce"); KERNEL_VALIDATE(clIntegrateForce) clIntegrateVelocity = createKernel("clIntegrateVelocity"); KERNEL_VALIDATE(clIntegrateVelocity) clHash = createKernel("clHash"); KERNEL_VALIDATE(clHash) clReorder = createKernel("clReorder"); KERNEL_VALIDATE(clReorder) clInitBounds = createKernel("clInitBounds"); KERNEL_VALIDATE(clInitBounds) clFindBounds = createKernel("clFindBounds"); KERNEL_VALIDATE(clFindBounds) clCalculateDensity = createKernel("clCalculateDensity"); KERNEL_VALIDATE(clCalculateDensity) clCalculateForces = createKernel("clCalculateForces"); KERNEL_VALIDATE(clCalculateForces) clGravity = createKernel("clGravity"); KERNEL_VALIDATE(clGravity) clClipBox = createKernel("clClipBox"); KERNEL_VALIDATE(clClipBox) // init fluid parameters clSetKernelArg(clInitFluid, 0, sizeof(cl_mem), bfParams); clEnqueueTask(mContext.getDevice(0), clInitFluid, 0, NULL, clInitFluid.getEvent()); bfParams.map(CL_MAP_READ); return bindBuffers(); }
// Sets up the GPU path for the cluster subclass: creates all kernels and
// device buffers, uploads the initial host data, and binds every kernel's
// argument list. No-op unless run_gpu is set and USE_GPU was compiled in.
void cluster_t::init_opencl(){
if(run_gpu){
// initialize the GPU if necessary
#ifdef USE_GPU
debug_opencl = false;
proxmap_t::init_opencl(); // base class sets up context/queue/program
cerr<<"Initializing OpenCL for cluster sub class\n";
cerr<<"P is "<<p<<", Workgroup width is "<<variable_blocks<<endl;
// CREATE KERNELS
createKernel("init_U",kernel_init_U);
createKernel("update_U",kernel_update_U);
createKernel("update_map_distance",kernel_update_map_distance);
createKernel("init_v_project_coeff",kernel_init_v_project_coeff);
createKernel("store_U_project",kernel_store_U_project);
createKernel("store_U_project_prev",kernel_store_U_project_prev);
createKernel("iterate_projection",kernel_iterate_projection);
createKernel("evaluate_obj",kernel_evaluate_obj);
createKernel("get_U_norm_diff",kernel_get_U_norm_diff);
cerr<<"Kernels created\n";
// CREATE BUFFERS
// U matrices are n*p floats; pairwise structures use triangle_dim;
// per-block scratch uses variable_blocks.
createBuffer<float>(CL_MEM_READ_WRITE,n*p,"buffer_U",buffer_U);
createBuffer<float>(CL_MEM_READ_WRITE,n*p,"buffer_U_prev",buffer_U_prev);
createBuffer<float>(CL_MEM_READ_WRITE,n*p,"buffer_U_project",buffer_U_project);
createBuffer<float>(CL_MEM_READ_WRITE,n*p,"buffer_U_project_orig",buffer_U_project_orig);
createBuffer<float>(CL_MEM_READ_WRITE,n*p,"buffer_U_project_prev",buffer_U_project_prev);
createBuffer<float>(CL_MEM_READ_WRITE,triangle_dim,"buffer_V_project_coeff",buffer_V_project_coeff);
createBuffer<float>(CL_MEM_READ_ONLY,n*p,"buffer_rawdata",buffer_rawdata);
createBuffer<float>(CL_MEM_READ_ONLY,triangle_dim,"buffer_weights",buffer_weights);
createBuffer<int>(CL_MEM_READ_ONLY,n,"buffer_offsets",buffer_offsets);
createBuffer<float>(CL_MEM_READ_WRITE,variable_blocks,"buffer_variable_block_norms1",buffer_variable_block_norms1);
createBuffer<float>(CL_MEM_READ_WRITE,variable_blocks,"buffer_variable_block_norms2",buffer_variable_block_norms2);
createBuffer<float>(CL_MEM_READ_WRITE,n*variable_blocks,"buffer_subject_variable_block_norms",buffer_subject_variable_block_norms);
// Single-element scalar parameter buffers.
createBuffer<float>(CL_MEM_READ_ONLY,1,"buffer_unweighted_lambda",buffer_unweighted_lambda);
createBuffer<float>(CL_MEM_READ_ONLY,1,"buffer_dist_func",buffer_dist_func);
createBuffer<float>(CL_MEM_READ_ONLY,1,"buffer_rho",buffer_rho);
createBuffer<float>(CL_MEM_READ_WRITE,n,"buffer_n_norms",buffer_n_norms);
createBuffer<float>(CL_MEM_READ_WRITE,triangle_dim,"buffer_n2_norms",buffer_n2_norms);
////createBuffer<>(CL_MEM_READ_ONLY,,"buffer_",buffer_);
cerr<<"GPU Buffers created\n";
// initialize anything here: upload the host-side state into the new buffers
writeToBuffer(buffer_U,n*p,U,"buffer_U");
writeToBuffer(buffer_U_prev,n*p,U_prev,"buffer_U_prev");
writeToBuffer(buffer_U_project,n*p,U_project,"buffer_U_project");
writeToBuffer(buffer_U_project_orig,n*p,U_project_orig,"buffer_U_project_orig");
writeToBuffer(buffer_rawdata,n*p,rawdata,"buffer_rawdata");
writeToBuffer(buffer_offsets,n,offsets,"buffer_offsets");
cerr<<"GPU Buffers initialized\n";
// SET KERNEL ARGUMENTS HERE
// NOTE(review): arg is reset to 0 per kernel and presumably advanced by
// setArg itself (passed by reference) — confirm against setArg's signature.
int arg;
//int kernelWorkGroupSize;
// update_U: p, dist_func, rho, U, U_prev, rawdata, U_project
arg = 0;
setArg(kernel_update_U,arg,p,"kernel_update_U");
setArg(kernel_update_U,arg,*buffer_dist_func,"kernel_update_U");
setArg(kernel_update_U,arg,*buffer_rho,"kernel_update_U");
setArg(kernel_update_U,arg,*buffer_U,"kernel_update_U");
setArg(kernel_update_U,arg,*buffer_U_prev,"kernel_update_U");
setArg(kernel_update_U,arg,*buffer_rawdata,"kernel_update_U");
setArg(kernel_update_U,arg,*buffer_U_project,"kernel_update_U");
// init_U: p, rawdata, U, U_project, U_project_orig
arg = 0;
setArg(kernel_init_U,arg,p,"kernel_init_U");
setArg(kernel_init_U,arg,*buffer_rawdata,"kernel_init_U");
setArg(kernel_init_U,arg,*buffer_U,"kernel_init_U");
setArg(kernel_init_U,arg,*buffer_U_project,"kernel_init_U");
setArg(kernel_init_U,arg,*buffer_U_project_orig,"kernel_init_U");
// update_map_distance: n, p, U, U_project, two norm buffers, two local scratch areas
arg = 0;
setArg(kernel_update_map_distance,arg,n,"kernel_update_map_distance");
setArg(kernel_update_map_distance,arg,p,"kernel_update_map_distance");
setArg(kernel_update_map_distance,arg,*buffer_U,"kernel_update_map_distance");
setArg(kernel_update_map_distance,arg,*buffer_U_project,"kernel_update_map_distance");
setArg(kernel_update_map_distance,arg,*buffer_variable_block_norms1,"kernel_update_map_distance");
setArg(kernel_update_map_distance,arg,*buffer_variable_block_norms2,"kernel_update_map_distance");
setArg(kernel_update_map_distance,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_update_map_distance");
setArg(kernel_update_map_distance,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_update_map_distance");
// init_v_project_coeff: dimensions, lambda, weights, projections, offsets, local scratch
arg = 0;
setArg(kernel_init_v_project_coeff,arg,n,"kernel_init_v_project_coeff");
setArg(kernel_init_v_project_coeff,arg,p,"kernel_init_v_project_coeff");
setArg(kernel_init_v_project_coeff,arg,variable_blocks,"kernel_init_v_project_coeff");
setArg(kernel_init_v_project_coeff,arg,*buffer_unweighted_lambda,"kernel_init_v_project_coeff");
setArg(kernel_init_v_project_coeff,arg,*buffer_weights,"kernel_init_v_project_coeff");
setArg(kernel_init_v_project_coeff,arg,*buffer_U_project_orig,"kernel_init_v_project_coeff");
setArg(kernel_init_v_project_coeff,arg,*buffer_V_project_coeff,"kernel_init_v_project_coeff");
setArg(kernel_init_v_project_coeff,arg,*buffer_offsets,"kernel_init_v_project_coeff");
setArg(kernel_init_v_project_coeff,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_init_v_project_coeff");
// store_U_project: p, U, U_project, U_project_orig
arg = 0;
setArg(kernel_store_U_project,arg,p,"kernel_store_U_project");
setArg(kernel_store_U_project,arg,*buffer_U,"kernel_store_U_project");
setArg(kernel_store_U_project,arg,*buffer_U_project,"kernel_store_U_project");
setArg(kernel_store_U_project,arg,*buffer_U_project_orig,"kernel_store_U_project");
// store_U_project_prev: p, U_project, U_project_prev
arg = 0;
setArg(kernel_store_U_project_prev,arg,p,"kernel_store_U_project_prev");
setArg(kernel_store_U_project_prev,arg,*buffer_U_project,"kernel_store_U_project_prev");
setArg(kernel_store_U_project_prev,arg,*buffer_U_project_prev,"kernel_store_U_project_prev");
// iterate_projection: dimensions, all projection state, four local scratch areas
arg = 0;
setArg(kernel_iterate_projection,arg,n,"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,p,"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,variable_blocks,"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,*buffer_U,"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,*buffer_U_project,"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,*buffer_U_project_orig,"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,*buffer_U_project_prev,"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,*buffer_offsets,"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,*buffer_weights,"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,*buffer_V_project_coeff,"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,*buffer_subject_variable_block_norms,"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_iterate_projection");
setArg(kernel_iterate_projection,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_iterate_projection");
// evaluate_obj: dimensions, data and state buffers, norm outputs, two local scratch areas
arg = 0;
setArg(kernel_evaluate_obj,arg,n,"kernel_evaluate_obj");
setArg(kernel_evaluate_obj,arg,p,"kernel_evaluate_obj");
setArg(kernel_evaluate_obj,arg,variable_blocks,"kernel_evaluate_obj");
setArg(kernel_evaluate_obj,arg,*buffer_offsets,"kernel_evaluate_obj");
setArg(kernel_evaluate_obj,arg,*buffer_rawdata,"kernel_evaluate_obj");
setArg(kernel_evaluate_obj,arg,*buffer_U,"kernel_evaluate_obj");
setArg(kernel_evaluate_obj,arg,*buffer_U_prev,"kernel_evaluate_obj");
setArg(kernel_evaluate_obj,arg,*buffer_U_project,"kernel_evaluate_obj");
setArg(kernel_evaluate_obj,arg,*buffer_weights,"kernel_evaluate_obj");
setArg(kernel_evaluate_obj,arg,*buffer_V_project_coeff,"kernel_evaluate_obj");
setArg(kernel_evaluate_obj,arg,*buffer_n_norms,"kernel_evaluate_obj");
setArg(kernel_evaluate_obj,arg,*buffer_n2_norms,"kernel_evaluate_obj");
setArg(kernel_evaluate_obj,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_evaluate_obj");
setArg(kernel_evaluate_obj,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_evaluate_obj");
// get_U_norm_diff: dimensions, U, U_prev, per-subject norms, local scratch
arg = 0;
setArg(kernel_get_U_norm_diff,arg,n,"kernel_get_U_norm_diff");
setArg(kernel_get_U_norm_diff,arg,p,"kernel_get_U_norm_diff");
setArg(kernel_get_U_norm_diff,arg,variable_blocks,"kernel_get_U_norm_diff");
setArg(kernel_get_U_norm_diff,arg,*buffer_U,"kernel_get_U_norm_diff");
setArg(kernel_get_U_norm_diff,arg,*buffer_U_prev,"kernel_get_U_norm_diff");
setArg(kernel_get_U_norm_diff,arg,*buffer_n_norms,"kernel_get_U_norm_diff");
setArg(kernel_get_U_norm_diff,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_get_U_norm_diff");
//setArg(kernel_reduce_weights2,arg,g_people,"kernel_reduce_weights2");
//kernelWorkGroupSize = kernel_reduce_weights2->getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[0], &err);
//clSafe(err,"get workgroup size kernel reduce_weights2");
//cerr<<"reduce_weights2 kernel work group size is "<<kernelWorkGroupSize<<endl;
cerr<<"GPU kernel arguments assigned.\n";
#endif
}
}