float *OpenClFindNearestNeighbors( cl_context context, int numRecords, std::vector<LatLong> &locations,float lat,float lng, int timing) { // 1. set up kernel cl_kernel NN_kernel; cl_int status; cl_program cl_NN_program; cl_NN_program = cl_compileProgram( (char *)"nearestNeighbor_kernel.cl",NULL); NN_kernel = clCreateKernel( cl_NN_program, "NearestNeighbor", &status); status = cl_errChk(status, (char *)"Error Creating Nearest Neighbor kernel",true); if(status)exit(1); // 2. set up memory on device and send ipts data to device // copy ipts(1,2) to device // also need to alloate memory for the distancePoints cl_mem d_locations; cl_mem d_distances; cl_int error=0; d_locations = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(LatLong) * numRecords, NULL, &error); d_distances = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * numRecords, NULL, &error); cl_command_queue command_queue = cl_getCommandQueue(); cl_event writeEvent,kernelEvent,readEvent; error = clEnqueueWriteBuffer(command_queue, d_locations, 1, // change to 0 for nonblocking write 0, // offset sizeof(LatLong) * numRecords, &locations[0], 0, NULL, &writeEvent); // 3. send arguments to device cl_int argchk; argchk = clSetKernelArg(NN_kernel, 0, sizeof(cl_mem), (void *)&d_locations); argchk |= clSetKernelArg(NN_kernel, 1, sizeof(cl_mem), (void *)&d_distances); argchk |= clSetKernelArg(NN_kernel, 2, sizeof(int), (void *)&numRecords); argchk |= clSetKernelArg(NN_kernel, 3, sizeof(float), (void *)&lat); argchk |= clSetKernelArg(NN_kernel, 4, sizeof(float), (void *)&lng); cl_errChk(argchk,"ERROR in Setting Nearest Neighbor kernel args",true); // 4. enqueue kernel size_t globalWorkSize[1]; globalWorkSize[0] = numRecords; if (numRecords % 64) globalWorkSize[0] += 64 - (numRecords % 64); //printf("Global Work Size: %zu\n",globalWorkSize[0]); error = clEnqueueNDRangeKernel( command_queue, NN_kernel, 1, 0, globalWorkSize,NULL, 0, NULL, &kernelEvent); cl_errChk(error,"ERROR in Executing Kernel NearestNeighbor",true); // 5. transfer data off of device // create distances std::vector float *distances = (float *)malloc(sizeof(float) * numRecords); error = clEnqueueReadBuffer(command_queue, d_distances, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * numRecords, distances, 0, NULL, &readEvent); cl_errChk(error,"ERROR with clEnqueueReadBuffer",true); if (timing) { clFinish(command_queue); cl_ulong eventStart,eventEnd,totalTime=0; printf("# Records\tWrite(s) [size]\t\tKernel(s)\tRead(s) [size]\t\tTotal(s)\n"); printf("%d \t",numRecords); // Write Buffer error = clGetEventProfilingInfo(writeEvent,CL_PROFILING_COMMAND_START, sizeof(cl_ulong),&eventStart,NULL); cl_errChk(error,"ERROR in Event Profiling (Write Start)",true); error = clGetEventProfilingInfo(writeEvent,CL_PROFILING_COMMAND_END, sizeof(cl_ulong),&eventEnd,NULL); cl_errChk(error,"ERROR in Event Profiling (Write End)",true); printf("%f [%.2fMB]\t",(float)((eventEnd-eventStart)/1e9),(float)((sizeof(LatLong) * numRecords)/1e6)); totalTime += eventEnd-eventStart; // Kernel error = clGetEventProfilingInfo(kernelEvent,CL_PROFILING_COMMAND_START, sizeof(cl_ulong),&eventStart,NULL); cl_errChk(error,"ERROR in Event Profiling (Kernel Start)",true); error = clGetEventProfilingInfo(kernelEvent,CL_PROFILING_COMMAND_END, sizeof(cl_ulong),&eventEnd,NULL); cl_errChk(error,"ERROR in Event Profiling (Kernel End)",true); printf("%f\t",(float)((eventEnd-eventStart)/1e9)); totalTime += eventEnd-eventStart; // Read Buffer error = clGetEventProfilingInfo(readEvent,CL_PROFILING_COMMAND_START, sizeof(cl_ulong),&eventStart,NULL); cl_errChk(error,"ERROR in Event Profiling (Read Start)",true); error = clGetEventProfilingInfo(readEvent,CL_PROFILING_COMMAND_END, sizeof(cl_ulong),&eventEnd,NULL); cl_errChk(error,"ERROR in Event Profiling (Read End)",true); printf("%f [%.2fMB]\t",(float)((eventEnd-eventStart)/1e9),(float)((sizeof(float) * numRecords)/1e6)); totalTime += eventEnd-eventStart; printf("%f\n\n",(float)(totalTime/1e9)); } // 6. return finalized data and release buffers clReleaseMemObject(d_locations); clReleaseMemObject(d_distances); return distances; }
/*! */ cl_kernel* cl_precompileKernels(char* buildOptions) { // Compile each program and create the kernel objects printf("Precompiling kernels...\n"); cl_time totalstart, totalend; cl_time start, end; cl_getTime(&totalstart); // Creating descriptors kernel cl_getTime(&start); program_list[1] = cl_compileProgram("CLSource/createDescriptors_kernel.cl", buildOptions, false); cl_getTime(&end); events->newCompileEvent(cl_computeTime(start, end), "createDescriptors"); kernel_list[KERNEL_SURF_DESC] = cl_createKernel(program_list[1], "createDescriptors_kernel"); // Get orientation kernels cl_getTime(&start); program_list[4] = cl_compileProgram("CLSource/getOrientation_kernels.cl", buildOptions, false); cl_getTime(&end); events->newCompileEvent(cl_computeTime(start, end), "Orientation"); kernel_list[KERNEL_GET_ORIENT1] = cl_createKernel(program_list[4], "getOrientationStep1"); kernel_list[KERNEL_GET_ORIENT2] = cl_createKernel(program_list[4], "getOrientationStep2"); // Hessian determinant kernel cl_getTime(&start); program_list[0] = cl_compileProgram("CLSource/hessianDet_kernel.cl", buildOptions, false); cl_getTime(&end); events->newCompileEvent(cl_computeTime(start, end), "hessian_det"); kernel_list[KERNEL_BUILD_DET] = cl_createKernel(program_list[0], "hessian_det"); // Integral image kernels cl_getTime(&start); program_list[6] = cl_compileProgram("CLSource/integralImage_kernels.cl", buildOptions, false); cl_getTime(&end); events->newCompileEvent(cl_computeTime(start, end), "IntegralImage"); kernel_list[KERNEL_SCAN] = cl_createKernel(program_list[6], "scan"); kernel_list[KERNEL_SCAN4] = cl_createKernel(program_list[6], "scan4"); kernel_list[KERNEL_SCANIMAGE] = cl_createKernel(program_list[6], "scanImage"); kernel_list[KERNEL_TRANSPOSE] = cl_createKernel(program_list[6], "transpose"); kernel_list[KERNEL_TRANSPOSEIMAGE] = cl_createKernel(program_list[6], "transposeImage"); // Nearest neighbor kernels cl_getTime(&start); program_list[5] = cl_compileProgram("CLSource/nearestNeighbor_kernel.cl", buildOptions, false); cl_getTime(&end); events->newCompileEvent(cl_computeTime(start, end), "NearestNeighbor"); kernel_list[KERNEL_NN] = cl_createKernel(program_list[5], "NearestNeighbor"); // Non-maximum suppression kernel cl_getTime(&start); program_list[3] = cl_compileProgram("CLSource/nonMaxSuppression_kernel.cl", buildOptions, false); cl_getTime(&end); events->newCompileEvent(cl_computeTime(start, end), "NonMaxSuppression"); kernel_list[KERNEL_NON_MAX_SUP] = cl_createKernel(program_list[3], "non_max_supression_kernel"); // Normalization of descriptors kernel cl_getTime(&start); program_list[2] = cl_compileProgram("CLSource/normalizeDescriptors_kernel.cl", buildOptions, false); cl_getTime(&end); events->newCompileEvent(cl_computeTime(start, end), "normalize"); kernel_list[KERNEL_NORM_DESC] = cl_createKernel(program_list[2], "normalizeDescriptors"); cl_getTime(&totalend); printf("\tTime for Off-Critical Path Compilation: %.3f milliseconds\n\n", cl_computeTime(totalstart, totalend)); return kernel_list; }
/*------------------------------------------------------ ** ForwardSub() -- Forward substitution of Gaussian ** elimination. **------------------------------------------------------ */ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,int timing){ // 1. set up kernels cl_kernel fan1_kernel,fan2_kernel; cl_int status=0; cl_program gaussianElim_program; cl_event writeEvent,kernelEvent,readEvent; float writeTime=0,readTime=0,kernelTime=0; float writeMB=0,readMB=0; gaussianElim_program = cl_compileProgram( (char *)"gaussianElim_kernels.cl",NULL); fan1_kernel = clCreateKernel( gaussianElim_program, "Fan1", &status); status = cl_errChk(status, (char *)"Error Creating Fan1 kernel",true); if(status)exit(1); fan2_kernel = clCreateKernel( gaussianElim_program, "Fan2", &status); status = cl_errChk(status, (char *)"Error Creating Fan2 kernel",true); if(status)exit(1); // 2. set up memory on device and send ipts data to device cl_mem a_dev, b_dev, m_dev; cl_int error=0; a_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*size*size, NULL, &error); b_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*size, NULL, &error); m_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * size * size, NULL, &error); command_queue = cl_getCommandQueue(); error = clEnqueueWriteBuffer(command_queue, a_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float)*size*size, a, 0, NULL, &writeEvent); if (timing) writeTime+=eventTime(writeEvent,command_queue); clReleaseEvent(writeEvent); error = clEnqueueWriteBuffer(command_queue, b_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float)*size, b, 0, NULL, &writeEvent); if (timing) writeTime+=eventTime(writeEvent,command_queue); clReleaseEvent(writeEvent); error = clEnqueueWriteBuffer(command_queue, m_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float)*size*size, m, 0, NULL, &writeEvent); if (timing) writeTime+=eventTime(writeEvent,command_queue); clReleaseEvent(writeEvent); writeMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6); // 3. Determine block sizes size_t globalWorksizeFan1[1]; size_t globalWorksizeFan2[2]; size_t localWorksizeFan1Buf[1]={BLOCK_SIZE_0}; size_t localWorksizeFan2Buf[2]={BLOCK_SIZE_1_X, BLOCK_SIZE_1_Y}; size_t *localWorksizeFan1=NULL; size_t *localWorksizeFan2=NULL; globalWorksizeFan1[0] = size; globalWorksizeFan2[0] = size; globalWorksizeFan2[1] = size; if(localWorksizeFan1Buf[0]){ localWorksizeFan1=localWorksizeFan1Buf; globalWorksizeFan1[0]=(int)ceil(globalWorksizeFan1[0]/(double)localWorksizeFan1Buf[0])*localWorksizeFan1Buf[0]; } if(localWorksizeFan2Buf[0]){ localWorksizeFan2=localWorksizeFan2Buf; globalWorksizeFan2[0]=(int)ceil(globalWorksizeFan2[0]/(double)localWorksizeFan2Buf[0])*localWorksizeFan2Buf[0]; globalWorksizeFan2[1]=(int)ceil(globalWorksizeFan2[1]/(double)localWorksizeFan2Buf[1])*localWorksizeFan2Buf[1]; } int t; // 4. Setup and Run kernels for (t=0; t<(size-1); t++) { // kernel args cl_int argchk; argchk = clSetKernelArg(fan1_kernel, 0, sizeof(cl_mem), (void *)&m_dev); argchk |= clSetKernelArg(fan1_kernel, 1, sizeof(cl_mem), (void *)&a_dev); argchk |= clSetKernelArg(fan1_kernel, 2, sizeof(cl_mem), (void *)&b_dev); argchk |= clSetKernelArg(fan1_kernel, 3, sizeof(int), (void *)&size); argchk |= clSetKernelArg(fan1_kernel, 4, sizeof(int), (void *)&t); cl_errChk(argchk,"ERROR in Setting Fan1 kernel args",true); //printf("localWorksizeFan1:%u, globalWorksizeFan1:%u\n", localWorksizeFan1Buf[0], globalWorksizeFan1[0]); #pragma dividend local_work_group_size localWorksizeFan1 dim 1 dim1(2:64:2:64) //This lws will be used to profile the OpenCL kernel with id 1 size_t _dividend_lws_localWorksizeFan1_k1[2]; { _dividend_lws_localWorksizeFan1_k1[0] = getLWSValue("DIVIDEND_LWS1_D0",DIVIDEND_LWS1_D0_DEFAULT_VAL); //Dividend extension: store the kernel id as the last element _dividend_lws_localWorksizeFan1_k1[1] = 1; } // launch kernel error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)( command_queue, fan1_kernel, 1, 0, globalWorksizeFan1, _dividend_lws_localWorksizeFan1_k1, 0, NULL, NULL); cl_errChk(error,"ERROR in Executing Fan1 Kernel",true); //fprintf(stderr, "AFTER THIS\n"); argchk = clSetKernelArg(fan2_kernel, 0, sizeof(cl_mem), (void *)&m_dev); argchk |= clSetKernelArg(fan2_kernel, 1, sizeof(cl_mem), (void *)&a_dev); argchk |= clSetKernelArg(fan2_kernel, 2, sizeof(cl_mem), (void *)&b_dev); argchk |= clSetKernelArg(fan2_kernel, 3, sizeof(int), (void *)&size); argchk |= clSetKernelArg(fan2_kernel, 4, sizeof(int), (void *)&t); cl_errChk(argchk,"ERROR in Setting Fan2 kernel args",true); size_t local_work_size[] = {128, 128}; //printf("localWorksizeFan2:%u, globalWorksizeFan2[0]:%u, globalWorksizeFan2[1]:%u\n", localWorksizeFan2Buf[0], globalWorksizeFan2[0], globalWorksizeFan2[1]); #pragma dividend local_work_group_size local_work_size dim 2 dim1(8:64:2:64) dim2(8:64:2:64) //This lws will be used to profile the OpenCL kernel with id 2 size_t _dividend_lws_local_work_size_k2[3]; { _dividend_lws_local_work_size_k2[0] = getLWSValue("DIVIDEND_LWS2_D0",DIVIDEND_LWS2_D0_DEFAULT_VAL); _dividend_lws_local_work_size_k2[1] = getLWSValue("DIVIDEND_LWS2_D1",DIVIDEND_LWS2_D1_DEFAULT_VAL); //Dividend extension: store the kernel id as the last element _dividend_lws_local_work_size_k2[2] = 2; } // launch kernel error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)( command_queue, fan2_kernel, 2, 0, globalWorksizeFan2, _dividend_lws_local_work_size_k2, 0, NULL, NULL); cl_errChk(error,"ERROR in Executing Fan2 Kernel",true); if (timing) { // printf("here2a\n"); // kernelTime+=eventTime(kernelEvent,command_queue); // printf("here2b\n"); } clReleaseEvent(kernelEvent); //Fan2<<<dimGridXY,dimBlockXY>>>(m_cuda,a_cuda,b_cuda,Size,Size-t,t); //cudaThreadSynchronize(); } // 5. transfer data off of device error = clEnqueueReadBuffer(command_queue, a_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size * size, a, 0, NULL, &readEvent); cl_errChk(error,"ERROR with clEnqueueReadBuffer",true); if (timing) readTime+=eventTime(readEvent,command_queue); clReleaseEvent(readEvent); error = clEnqueueReadBuffer(command_queue, b_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size, b, 0, NULL, &readEvent); cl_errChk(error,"ERROR with clEnqueueReadBuffer",true); if (timing) readTime+=eventTime(readEvent,command_queue); clReleaseEvent(readEvent); error = clEnqueueReadBuffer(command_queue, m_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size * size, m, 0, NULL, &readEvent); cl_errChk(error,"ERROR with clEnqueueReadBuffer",true); if (timing) readTime+=eventTime(readEvent,command_queue); clReleaseEvent(readEvent); readMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6); if (timing) { printf("Matrix Size\tWrite(s) [size]\t\tKernel(s)\tRead(s) [size]\t\tTotal(s)\n"); printf("%dx%d \t",size,size); printf("%f [%.2fMB]\t",writeTime,writeMB); printf("%f\t",kernelTime); printf("%f [%.2fMB]\t",readTime,readMB); printf("%f\n\n",writeTime+kernelTime+readTime); } }