/*! Prints out the time taken between the start and end of an event.\n Adds synchronization in order to be sure that events have occured otherwise profiling calls will fail \n Shouldnt be used on critical path due to the necessary flushing of the queue \param event_time */ void cl_KernelTimeSync(cl_event event_time) { cl_int kerneltimer; clFlush(cl_getCommandQueue()); clFinish(cl_getCommandQueue()); cl_ulong starttime; cl_ulong endtime; kerneltimer = clGetEventProfilingInfo(event_time, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &starttime, NULL); if(cl_errChk(kerneltimer, "Error in Start Time \n"))exit(1); kerneltimer = clGetEventProfilingInfo(event_time, CL_PROFILING_COMMAND_END , sizeof(cl_ulong), &endtime, NULL); if(cl_errChk(kerneltimer, "Error in Start Time \n"))exit(1); unsigned long elapsed = (unsigned long)(endtime - starttime); printf("\tTime Elapsed in Kernel is %ld ns\n",elapsed); }
float *OpenClFindNearestNeighbors( cl_context context, int numRecords, std::vector<LatLong> &locations,float lat,float lng, int timing) { // 1. set up kernel cl_kernel NN_kernel; cl_int status; cl_program cl_NN_program; cl_NN_program = cl_compileProgram( (char *)"nearestNeighbor_kernel.cl",NULL); NN_kernel = clCreateKernel( cl_NN_program, "NearestNeighbor", &status); status = cl_errChk(status, (char *)"Error Creating Nearest Neighbor kernel",true); if(status)exit(1); // 2. set up memory on device and send ipts data to device // copy ipts(1,2) to device // also need to alloate memory for the distancePoints cl_mem d_locations; cl_mem d_distances; cl_int error=0; d_locations = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(LatLong) * numRecords, NULL, &error); d_distances = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * numRecords, NULL, &error); cl_command_queue command_queue = cl_getCommandQueue(); cl_event writeEvent,kernelEvent,readEvent; error = clEnqueueWriteBuffer(command_queue, d_locations, 1, // change to 0 for nonblocking write 0, // offset sizeof(LatLong) * numRecords, &locations[0], 0, NULL, &writeEvent); // 3. send arguments to device cl_int argchk; argchk = clSetKernelArg(NN_kernel, 0, sizeof(cl_mem), (void *)&d_locations); argchk |= clSetKernelArg(NN_kernel, 1, sizeof(cl_mem), (void *)&d_distances); argchk |= clSetKernelArg(NN_kernel, 2, sizeof(int), (void *)&numRecords); argchk |= clSetKernelArg(NN_kernel, 3, sizeof(float), (void *)&lat); argchk |= clSetKernelArg(NN_kernel, 4, sizeof(float), (void *)&lng); cl_errChk(argchk,"ERROR in Setting Nearest Neighbor kernel args",true); // 4. enqueue kernel size_t globalWorkSize[1]; globalWorkSize[0] = numRecords; if (numRecords % 64) globalWorkSize[0] += 64 - (numRecords % 64); //printf("Global Work Size: %zu\n",globalWorkSize[0]); error = clEnqueueNDRangeKernel( command_queue, NN_kernel, 1, 0, globalWorkSize,NULL, 0, NULL, &kernelEvent); cl_errChk(error,"ERROR in Executing Kernel NearestNeighbor",true); // 5. transfer data off of device // create distances std::vector float *distances = (float *)malloc(sizeof(float) * numRecords); error = clEnqueueReadBuffer(command_queue, d_distances, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * numRecords, distances, 0, NULL, &readEvent); cl_errChk(error,"ERROR with clEnqueueReadBuffer",true); if (timing) { clFinish(command_queue); cl_ulong eventStart,eventEnd,totalTime=0; printf("# Records\tWrite(s) [size]\t\tKernel(s)\tRead(s) [size]\t\tTotal(s)\n"); printf("%d \t",numRecords); // Write Buffer error = clGetEventProfilingInfo(writeEvent,CL_PROFILING_COMMAND_START, sizeof(cl_ulong),&eventStart,NULL); cl_errChk(error,"ERROR in Event Profiling (Write Start)",true); error = clGetEventProfilingInfo(writeEvent,CL_PROFILING_COMMAND_END, sizeof(cl_ulong),&eventEnd,NULL); cl_errChk(error,"ERROR in Event Profiling (Write End)",true); printf("%f [%.2fMB]\t",(float)((eventEnd-eventStart)/1e9),(float)((sizeof(LatLong) * numRecords)/1e6)); totalTime += eventEnd-eventStart; // Kernel error = clGetEventProfilingInfo(kernelEvent,CL_PROFILING_COMMAND_START, sizeof(cl_ulong),&eventStart,NULL); cl_errChk(error,"ERROR in Event Profiling (Kernel Start)",true); error = clGetEventProfilingInfo(kernelEvent,CL_PROFILING_COMMAND_END, sizeof(cl_ulong),&eventEnd,NULL); cl_errChk(error,"ERROR in Event Profiling (Kernel End)",true); printf("%f\t",(float)((eventEnd-eventStart)/1e9)); totalTime += eventEnd-eventStart; // Read Buffer error = clGetEventProfilingInfo(readEvent,CL_PROFILING_COMMAND_START, sizeof(cl_ulong),&eventStart,NULL); cl_errChk(error,"ERROR in Event Profiling (Read Start)",true); error = clGetEventProfilingInfo(readEvent,CL_PROFILING_COMMAND_END, sizeof(cl_ulong),&eventEnd,NULL); cl_errChk(error,"ERROR in Event Profiling (Read End)",true); printf("%f [%.2fMB]\t",(float)((eventEnd-eventStart)/1e9),(float)((sizeof(float) * numRecords)/1e6)); totalTime += eventEnd-eventStart; printf("%f\n\n",(float)(totalTime/1e9)); } // 6. return finalized data and release buffers clReleaseMemObject(d_locations); clReleaseMemObject(d_distances); return distances; }
/*------------------------------------------------------ ** ForwardSub() -- Forward substitution of Gaussian ** elimination. **------------------------------------------------------ */ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,int timing){ // 1. set up kernels cl_kernel fan1_kernel,fan2_kernel; cl_int status=0; cl_program gaussianElim_program; cl_event writeEvent,kernelEvent,readEvent; float writeTime=0,readTime=0,kernelTime=0; float writeMB=0,readMB=0; gaussianElim_program = cl_compileProgram( (char *)"gaussianElim_kernels.cl",NULL); fan1_kernel = clCreateKernel( gaussianElim_program, "Fan1", &status); status = cl_errChk(status, (char *)"Error Creating Fan1 kernel",true); if(status)exit(1); fan2_kernel = clCreateKernel( gaussianElim_program, "Fan2", &status); status = cl_errChk(status, (char *)"Error Creating Fan2 kernel",true); if(status)exit(1); // 2. set up memory on device and send ipts data to device cl_mem a_dev, b_dev, m_dev; cl_int error=0; a_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*size*size, NULL, &error); b_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*size, NULL, &error); m_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * size * size, NULL, &error); command_queue = cl_getCommandQueue(); error = clEnqueueWriteBuffer(command_queue, a_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float)*size*size, a, 0, NULL, &writeEvent); if (timing) writeTime+=eventTime(writeEvent,command_queue); clReleaseEvent(writeEvent); error = clEnqueueWriteBuffer(command_queue, b_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float)*size, b, 0, NULL, &writeEvent); if (timing) writeTime+=eventTime(writeEvent,command_queue); clReleaseEvent(writeEvent); error = clEnqueueWriteBuffer(command_queue, m_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float)*size*size, m, 0, NULL, &writeEvent); if (timing) writeTime+=eventTime(writeEvent,command_queue); clReleaseEvent(writeEvent); writeMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6); // 3. Determine block sizes size_t globalWorksizeFan1[1]; size_t globalWorksizeFan2[2]; size_t localWorksizeFan1Buf[1]={BLOCK_SIZE_0}; size_t localWorksizeFan2Buf[2]={BLOCK_SIZE_1_X, BLOCK_SIZE_1_Y}; size_t *localWorksizeFan1=NULL; size_t *localWorksizeFan2=NULL; globalWorksizeFan1[0] = size; globalWorksizeFan2[0] = size; globalWorksizeFan2[1] = size; if(localWorksizeFan1Buf[0]){ localWorksizeFan1=localWorksizeFan1Buf; globalWorksizeFan1[0]=(int)ceil(globalWorksizeFan1[0]/(double)localWorksizeFan1Buf[0])*localWorksizeFan1Buf[0]; } if(localWorksizeFan2Buf[0]){ localWorksizeFan2=localWorksizeFan2Buf; globalWorksizeFan2[0]=(int)ceil(globalWorksizeFan2[0]/(double)localWorksizeFan2Buf[0])*localWorksizeFan2Buf[0]; globalWorksizeFan2[1]=(int)ceil(globalWorksizeFan2[1]/(double)localWorksizeFan2Buf[1])*localWorksizeFan2Buf[1]; } int t; // 4. Setup and Run kernels for (t=0; t<(size-1); t++) { // kernel args cl_int argchk; argchk = clSetKernelArg(fan1_kernel, 0, sizeof(cl_mem), (void *)&m_dev); argchk |= clSetKernelArg(fan1_kernel, 1, sizeof(cl_mem), (void *)&a_dev); argchk |= clSetKernelArg(fan1_kernel, 2, sizeof(cl_mem), (void *)&b_dev); argchk |= clSetKernelArg(fan1_kernel, 3, sizeof(int), (void *)&size); argchk |= clSetKernelArg(fan1_kernel, 4, sizeof(int), (void *)&t); cl_errChk(argchk,"ERROR in Setting Fan1 kernel args",true); //printf("localWorksizeFan1:%u, globalWorksizeFan1:%u\n", localWorksizeFan1Buf[0], globalWorksizeFan1[0]); #pragma dividend local_work_group_size localWorksizeFan1 dim 1 dim1(2:64:2:64) //This lws will be used to profile the OpenCL kernel with id 1 size_t _dividend_lws_localWorksizeFan1_k1[2]; { _dividend_lws_localWorksizeFan1_k1[0] = getLWSValue("DIVIDEND_LWS1_D0",DIVIDEND_LWS1_D0_DEFAULT_VAL); //Dividend extension: store the kernel id as the last element _dividend_lws_localWorksizeFan1_k1[1] = 1; } // launch kernel error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)( command_queue, fan1_kernel, 1, 0, globalWorksizeFan1, _dividend_lws_localWorksizeFan1_k1, 0, NULL, NULL); cl_errChk(error,"ERROR in Executing Fan1 Kernel",true); //fprintf(stderr, "AFTER THIS\n"); argchk = clSetKernelArg(fan2_kernel, 0, sizeof(cl_mem), (void *)&m_dev); argchk |= clSetKernelArg(fan2_kernel, 1, sizeof(cl_mem), (void *)&a_dev); argchk |= clSetKernelArg(fan2_kernel, 2, sizeof(cl_mem), (void *)&b_dev); argchk |= clSetKernelArg(fan2_kernel, 3, sizeof(int), (void *)&size); argchk |= clSetKernelArg(fan2_kernel, 4, sizeof(int), (void *)&t); cl_errChk(argchk,"ERROR in Setting Fan2 kernel args",true); size_t local_work_size[] = {128, 128}; //printf("localWorksizeFan2:%u, globalWorksizeFan2[0]:%u, globalWorksizeFan2[1]:%u\n", localWorksizeFan2Buf[0], globalWorksizeFan2[0], globalWorksizeFan2[1]); #pragma dividend local_work_group_size local_work_size dim 2 dim1(8:64:2:64) dim2(8:64:2:64) //This lws will be used to profile the OpenCL kernel with id 2 size_t _dividend_lws_local_work_size_k2[3]; { _dividend_lws_local_work_size_k2[0] = getLWSValue("DIVIDEND_LWS2_D0",DIVIDEND_LWS2_D0_DEFAULT_VAL); _dividend_lws_local_work_size_k2[1] = getLWSValue("DIVIDEND_LWS2_D1",DIVIDEND_LWS2_D1_DEFAULT_VAL); //Dividend extension: store the kernel id as the last element _dividend_lws_local_work_size_k2[2] = 2; } // launch kernel error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)( command_queue, fan2_kernel, 2, 0, globalWorksizeFan2, _dividend_lws_local_work_size_k2, 0, NULL, NULL); cl_errChk(error,"ERROR in Executing Fan2 Kernel",true); if (timing) { // printf("here2a\n"); // kernelTime+=eventTime(kernelEvent,command_queue); // printf("here2b\n"); } clReleaseEvent(kernelEvent); //Fan2<<<dimGridXY,dimBlockXY>>>(m_cuda,a_cuda,b_cuda,Size,Size-t,t); //cudaThreadSynchronize(); } // 5. transfer data off of device error = clEnqueueReadBuffer(command_queue, a_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size * size, a, 0, NULL, &readEvent); cl_errChk(error,"ERROR with clEnqueueReadBuffer",true); if (timing) readTime+=eventTime(readEvent,command_queue); clReleaseEvent(readEvent); error = clEnqueueReadBuffer(command_queue, b_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size, b, 0, NULL, &readEvent); cl_errChk(error,"ERROR with clEnqueueReadBuffer",true); if (timing) readTime+=eventTime(readEvent,command_queue); clReleaseEvent(readEvent); error = clEnqueueReadBuffer(command_queue, m_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size * size, m, 0, NULL, &readEvent); cl_errChk(error,"ERROR with clEnqueueReadBuffer",true); if (timing) readTime+=eventTime(readEvent,command_queue); clReleaseEvent(readEvent); readMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6); if (timing) { printf("Matrix Size\tWrite(s) [size]\t\tKernel(s)\tRead(s) [size]\t\tTotal(s)\n"); printf("%dx%d \t",size,size); printf("%f [%.2fMB]\t",writeTime,writeMB); printf("%f\t",kernelTime); printf("%f [%.2fMB]\t",readTime,readMB); printf("%f\n\n",writeTime+kernelTime+readTime); } }