int main (int argc, char* argv[]) { struct timespec start, end; initOpenCL(); /* START Measurements */ get_date(argc, argv); clock_gettime(CLOCK_MONOTONIC, &start); int arraySize = SIZE; size_t bufferSize = arraySize * sizeof(double); double* inputA = (double*) malloc (bufferSize); double* inputB = (double*) malloc (bufferSize); double* output = (double*) malloc (bufferSize); /* Initilise data */ initialize_data(arraySize, inputA, inputB); /* Computation */ vector_sum(arraySize, inputA, inputB, output); /* END Measurements */ clock_gettime(CLOCK_MONOTONIC, &end); get_date(argc, argv); /* Check results */ //check_results(arraySize, inputA, inputB, output); /* Cleaning */ cleanUpOpenCL(); double res=0; int i; for (i = 0; i < arraySize; i++) { res = res + output[i] ; } times = (((double)(end.tv_sec - start.tv_sec)*1000) + ((double)(end.tv_nsec - start.tv_nsec)/1000000)); //cout << "Time to finish: " << times << " ms" << endl; printf("Time to finish %6.0f ms [check = %e]\n", times, res); }
int CVC_cl::buildCV(const Mat& lImg, const Mat& rImg, cl_mem *memoryObjects) { lImgRGB = new Mat[lImg.channels()]; rImgRGB = new Mat[rImg.channels()]; split(lImg, lImgRGB); split(rImg, rImgRGB); cvtColor(lImg, lGray, CV_RGB2GRAY); cvtColor(rImg, rGray, CV_RGB2GRAY); /* Map the input memory objects to host side pointers. */ bool EnqueueMapBufferSuccess = true; // if(imgType == CV_32F) // { //Sobel filter to compute X gradient <-- investigate Mali Sobel OpenCL kernel Sobel( lGray, lGrdX, CV_32F, 1, 0, 1 ); // ex time 16 -17ms Sobel( rGray, rGrdX, CV_32F, 1, 0, 1 ); // for both lGrdX += 0.5; rGrdX += 0.5; cl_float *clbuffer_lImgRGB[3], *clbuffer_rImgRGB[3]; for (int i = 0; i < channels; i++) { clbuffer_lImgRGB[i] = (cl_float*)clEnqueueMapBuffer(*commandQueue, memoryObjects[i], CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, bufferSize_2D, 0, NULL, NULL, &errorNumber); EnqueueMapBufferSuccess &= checkSuccess(errorNumber); clbuffer_rImgRGB[i] = (cl_float*)clEnqueueMapBuffer(*commandQueue, memoryObjects[i+channels], CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, bufferSize_2D, 0, NULL, NULL, &errorNumber); EnqueueMapBufferSuccess &= checkSuccess(errorNumber); memcpy(clbuffer_lImgRGB[i], lImgRGB[i].data, bufferSize_2D); memcpy(clbuffer_rImgRGB[i], rImgRGB[i].data, bufferSize_2D); } // } // else if(imgType == CV_8U) // { // //Sobel filter to compute X gradient // Sobel( lGray, lGrdX, CV_8U, 1, 0, 1 ); // Sobel( rGray, rGrdX, CV_8U, 1, 0, 1 ); // lGrdX += 0.5; // rGrdX += 0.5; // // cl_uchar *clbuffer_lImgRGB[3], *clbuffer_rImgRGB[3]; // //Six 1-channel 2D buffers W*H // for (int i = 0; i < channels; i++) // { // clbuffer_lImgRGB[i] = (cl_uchar*)clEnqueueMapBuffer(*commandQueue, memoryObjects[i], CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, bufferSize_2D, 0, NULL, NULL, &errorNumber); // EnqueueMapBufferSuccess &= checkSuccess(errorNumber); // clbuffer_rImgRGB[i] = (cl_uchar*)clEnqueueMapBuffer(*commandQueue, memoryObjects[i+channels], CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, bufferSize_2D, 0, NULL, NULL, &errorNumber); // EnqueueMapBufferSuccess &= checkSuccess(errorNumber); // // memcpy(clbuffer_lImgRGB[i], lImgRGB[i].data, bufferSize_2D); // memcpy(clbuffer_rImgRGB[i], rImgRGB[i].data, bufferSize_2D); // } // } //Two 1-channel 2D buffers W*H cl_uchar *clbuffer_lGrdX = (cl_uchar*)clEnqueueMapBuffer(*commandQueue, memoryObjects[CVC_LGRDX], CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, bufferSize_2D, 0, NULL, NULL, &errorNumber); EnqueueMapBufferSuccess &= checkSuccess(errorNumber); cl_uchar *clbuffer_rGrdX = (cl_uchar*)clEnqueueMapBuffer(*commandQueue, memoryObjects[CVC_RGRDX], CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, bufferSize_2D, 0, NULL, NULL, &errorNumber); EnqueueMapBufferSuccess &= checkSuccess(errorNumber); if (!EnqueueMapBufferSuccess) { cleanUpOpenCL(NULL, NULL, program, NULL, NULL, 0); cerr << "Mapping memory objects failed " << __FILE__ << ":"<< __LINE__ << endl; } //printf("CVC_cl: Copying data to OpenCL memory space\n"); memcpy(clbuffer_lGrdX, lGrdX.data, bufferSize_2D); memcpy(clbuffer_rGrdX, rGrdX.data, bufferSize_2D); int arg_num = 0; /* Setup the kernel arguments. */ bool setKernelArgumentsSuccess = true; setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_LIMGR])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_LIMGG])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_LIMGB])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_RIMGR])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_RIMGG])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_RIMGB])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_LGRDX])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_RGRDX])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_int), &height)); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_int), &width)); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CV_LCV])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CV_RCV])); if (!setKernelArgumentsSuccess) { cleanUpOpenCL(NULL, NULL, NULL, NULL, NULL, 0); cerr << "Failed setting OpenCL kernel arguments. " << __FILE__ << ":"<< __LINE__ << endl; } if(OCL_STATS) printf("CVC_cl: Running CVC Kernels\n"); /* Enqueue the kernel */ if (!checkSuccess(clEnqueueNDRangeKernel(*commandQueue, kernel, 3, NULL, globalWorksize, NULL, 0, NULL, &event))) { cleanUpOpenCL(NULL, NULL, NULL, NULL, NULL, 0); cerr << "Failed enqueuing the kernel. " << __FILE__ << ":"<< __LINE__ << endl; return 1; } /* Wait for completion */ if (!checkSuccess(clFinish(*commandQueue))) { cleanUpOpenCL(NULL, NULL, NULL, NULL, NULL, 0); cerr << "Failed waiting for kernel execution to finish. " << __FILE__ << ":"<< __LINE__ << endl; return 1; } /* Print the profiling information for the event. */ if(OCL_STATS) printProfilingInfo(event); /* Release the event object. */ if (!checkSuccess(clReleaseEvent(event))) { cleanUpOpenCL(*context, *commandQueue, program, NULL, NULL, 0); cerr << "Failed releasing the event object. " << __FILE__ << ":"<< __LINE__ << endl; return 1; } return 0; }
CVC_cl::~CVC_cl(void) { /* Release OpenCL objects. */ cleanUpOpenCL(NULL, NULL, NULL, kernel, NULL, 0); }
CVC_cl::CVC_cl(cl_context* context, cl_command_queue* commandQueue, cl_device_id device, Mat* I, const int d) : maxDis(d), context(context), commandQueue(commandQueue) { //printf("OpenCL Colours and Gradients method for Cost Computation\n"); //OpenCL Setup program = 0; kernel = 0; // imgType = I->type() & CV_MAT_DEPTH_MASK; if (!createProgram(*context, device, FILE_CVC_PROG, &program)) { cleanUpOpenCL(NULL, NULL, program, NULL, NULL, 0); cerr << "Failed to create OpenCL program." << __FILE__ << ":"<< __LINE__ << endl; } width = (cl_int)I->cols; height = (cl_int)I->rows; // channels = (cl_int)I->channels(); // if(imgType == CV_32F) // { strcpy(kernel_name, "cvc_float_nv"); //strcpy(kernel_name, "cvc_float_v4"); bufferSize_2D = width * height * sizeof(cl_float); bufferSize_3D = width * height * maxDis * sizeof(cl_float); //cvc_uchar_nv globalWorksize[0] = (size_t)width; //cvc_uchar_v4 // globalWorksize[0] = (size_t)width/4; globalWorksize[1] = (size_t)height; globalWorksize[2] = (size_t)maxDis; // } // else if(imgType == CV_8U) // { // strcpy(kernel_name, "cvc_uchar_vx"); // //strcpy(kernel_name, "cvc_uchar_v16"); // //strcpy(kernel_name, "cvc_uchar_nv"); // // bufferSize_2D = width * height * sizeof(cl_uchar); // bufferSize_3D = width * height * maxDis * sizeof(cl_uchar); // // //cvc_uchar_vx // globalWorksize[0] = (size_t)height; // globalWorksize[1] = (size_t)1; // //cvc_uchar_v16 //// globalWorksize[0] = (size_t)width/16; // //cvc_uchar_nv //// globalWorksize[0] = (size_t)width; //// globalWorksize[1] = (size_t)height; // // globalWorksize[2] = (size_t)maxDis; // // } // else{ // printf("CVC_cl: Error - Unrecognised data type in processing! (CVC_cl)\n"); // exit(1); // } kernel = clCreateKernel(program, kernel_name, &errorNumber); if (!checkSuccess(errorNumber)) { cleanUpOpenCL(NULL, NULL, NULL, NULL, NULL, 0); cerr << "Failed to create OpenCL kernel. " << __FILE__ << ":"<< __LINE__ << endl; exit(1); } else{ printf("CVC_cl: OpenCL kernels created.\n"); } /* An event to associate with the Kernel. Allows us to retreive profiling information later. */ event = 0; }
float sgemmMain(int rowa,int cola,int colb) { cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; const unsigned int numberOfMemoryObjects = 3; cl_mem memoryObjectsa = 0; cl_mem memoryObjectsb = 0; cl_mem memoryObjectsc = 0; cl_int errorNumber; cl_uint clrowa = rowa; cl_uint clcola = cola; cl_uint clcolb = colb; int err; err = createContext(&context); LOGD("create context"); err = createCommandQueue(context, &commandQueue, &device); err = createProgram(context, device, "/mnt/sdcard/kernel/sgemm.cl", &program); kernel = clCreateKernel(program, "sgemm", &errorNumber); LOGD("createKernel code %d",errorNumber); LOGD("start computing"); float alpha = 1; float beta = 0.1; /* Create the matrices. */ size_t matrixSizea = rowa * cola; size_t matrixSizeb = cola * colb; size_t matrixSizec = rowa * colb; /* As all the matrices have the same size, the buffer size is common. */ size_t bufferSizea = matrixSizea * sizeof(float); size_t bufferSizeb = matrixSizeb * sizeof(float); size_t bufferSizec = matrixSizec * sizeof(float); /* Create buffers for the matrices used in the kernel. */ int createMemoryObjectsSuccess = 0; memoryObjectsa = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizea, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; memoryObjectsb = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizeb, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; memoryObjectsc = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bufferSizec, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; LOGD("create memory err %d",createMemoryObjectsSuccess); int mapMemoryObjectsSuccess = 0; cl_float* matrixA = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsa, CL_TRUE, CL_MAP_WRITE, 0, bufferSizea, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; cl_float* matrixB = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsb, CL_TRUE, CL_MAP_WRITE, 0, bufferSizeb, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; cl_float* matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_WRITE, 0, bufferSizec, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; LOGD("map memory err %d",mapMemoryObjectsSuccess); sgemmInitialize(rowa,cola,colb, matrixA, matrixB, matrixC); LOGD("data initial finish"); int unmapMemoryObjectsSuccess = 0; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsa, matrixA, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsb, matrixB, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; LOGD("unmap memory err %d",unmapMemoryObjectsSuccess); int setKernelArgumentsSuccess = 0; errorNumber = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjectsa); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjectsb); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjectsc); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 3, sizeof(cl_uint), &clrowa); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 4, sizeof(cl_uint), &clcola); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 5, sizeof(cl_uint), &clcolb); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 6, sizeof(cl_float), &alpha); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 7, sizeof(cl_float), &beta); setKernelArgumentsSuccess &= errorNumber; LOGD("setKernel err %d",setKernelArgumentsSuccess); LOGD("start running kernel"); clock_t start_t,end_t; float cost_time; start_t = clock(); cl_event event = 0; size_t globalWorksize[2] = {rowa, colb}; errorNumber = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorksize, NULL, 0, NULL, &event); //LOGD("Enqueue err code %d",errorNumber); errorNumber = clFinish(commandQueue); end_t = clock(); cost_time = (float)(end_t-start_t)/CLOCKS_PER_SEC*1000; LOGD("Finish err code %d",errorNumber); float time; time = printProfilingInfo(event); LOGT("using CPU clock: %f ms",cost_time); LOGT("using GPU clock: %f ms",time); clReleaseEvent(event); matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_READ, 0, bufferSizec, 0, NULL, NULL, &errorNumber); clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL); LOGD("read out matrixC finish"); LOGD("matrixC value C(0,0): %f",matrixC[0]); cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjectsa, memoryObjectsb,memoryObjectsc,numberOfMemoryObjects); LOGD("RUNNING finsh"); return time; }
inline void vector_sum(const int arraySize, const double* inputA, const double* inputB, double* output) { /* Allocate memory buffers */ /* * Ask the OpenCL implementation to allocate buffers for the data. * We ask the OpenCL implemenation to allocate memory rather than * allocating it on the CPU to avoid having to copy the data later. * The read/write flags relate to accesses to the memory from within * the kernel. */ bool createMemoryObjectSuccess = true; int numberOfMemoryObjects = 3; cl_mem memoryObjects[3] = {0, 0, 0}; int errorNumber = 0; int bufferSize = arraySize*sizeof(double); memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bufferSize, (void*)inputA, &errorNumber); checkErr(errorNumber, "Failed to create buffer, 1."); memoryObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bufferSize, (void*)inputB, &errorNumber); checkErr(errorNumber, "Failed to create buffer, 2."); memoryObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, bufferSize, output, &errorNumber); checkErr(errorNumber, "Failed to create buffer, 3."); /* Enqueue commands and kernels */ /* Enqueue to the command queues the commands that control the sequence * and synchronization of kernel execution, reading and writing of data, * and manipulation of memory objects */ /* Execute a kernel function */ /* Call clSetKernelArg() for each parameter in the kernel */ bool setKernelArgumentsSuccess = true; setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjects[0])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[1])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjects[2])); if (not setKernelArgumentsSuccess) { cleanUpOpenCL(); std::cerr << "Failed setting OpenCL kernel arguments. " << __FILE__ << ":"<< __LINE__ << std::endl; exit(1); } /* Determine the work-group size and index space for the kernel */ const size_t globalWorkSize[1] = {arraySize}; const size_t localWorkSize[1] = { 1 }; /* Enqueue the kernel for execution in the command queue */ //for (int j = 0; j < ITER; j++) { if (not checkSuccess(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL))) { cleanUpOpenCL(); std::cerr << "Failed enqueuing the kernel. " << __FILE__ << ":" << __LINE__ <<std::endl; exit(1); } //} /* Get a pointer to the output data */ output = (double*)clEnqueueMapBuffer(commandQueue, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, arraySize, 0, NULL, NULL, &errorNumber); if (not checkSuccess(errorNumber)) { cleanUpOpenCL(); std::cerr << "Failed to map buffer " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } /* Wait for kernel execution */ if (not checkSuccess(clFinish(commandQueue))) { cleanUpOpenCL(); std::cerr << "Failed waiting for kernel execution to finish. " << __FILE__ << ":"<< __LINE__ << std::endl; exit(1); } /* Unmap the memory objects as we finished using them in the CPU */ if (not checkSuccess(clReleaseMemObject(memoryObjects[0]))) { cleanUpOpenCL(); std::cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } if (not checkSuccess(clReleaseMemObject(memoryObjects[1]))) { cleanUpOpenCL(); std::cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } if (not checkSuccess(clEnqueueUnmapMemObject(commandQueue, memoryObjects[2], output, 0, NULL, NULL))) { cleanUpOpenCL(); std::cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } }