int CVC_cl::buildCV(const Mat& lImg, const Mat& rImg, cl_mem *memoryObjects) { lImgRGB = new Mat[lImg.channels()]; rImgRGB = new Mat[rImg.channels()]; split(lImg, lImgRGB); split(rImg, rImgRGB); cvtColor(lImg, lGray, CV_RGB2GRAY); cvtColor(rImg, rGray, CV_RGB2GRAY); /* Map the input memory objects to host side pointers. */ bool EnqueueMapBufferSuccess = true; // if(imgType == CV_32F) // { //Sobel filter to compute X gradient <-- investigate Mali Sobel OpenCL kernel Sobel( lGray, lGrdX, CV_32F, 1, 0, 1 ); // ex time 16 -17ms Sobel( rGray, rGrdX, CV_32F, 1, 0, 1 ); // for both lGrdX += 0.5; rGrdX += 0.5; cl_float *clbuffer_lImgRGB[3], *clbuffer_rImgRGB[3]; for (int i = 0; i < channels; i++) { clbuffer_lImgRGB[i] = (cl_float*)clEnqueueMapBuffer(*commandQueue, memoryObjects[i], CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, bufferSize_2D, 0, NULL, NULL, &errorNumber); EnqueueMapBufferSuccess &= checkSuccess(errorNumber); clbuffer_rImgRGB[i] = (cl_float*)clEnqueueMapBuffer(*commandQueue, memoryObjects[i+channels], CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, bufferSize_2D, 0, NULL, NULL, &errorNumber); EnqueueMapBufferSuccess &= checkSuccess(errorNumber); memcpy(clbuffer_lImgRGB[i], lImgRGB[i].data, bufferSize_2D); memcpy(clbuffer_rImgRGB[i], rImgRGB[i].data, bufferSize_2D); } // } // else if(imgType == CV_8U) // { // //Sobel filter to compute X gradient // Sobel( lGray, lGrdX, CV_8U, 1, 0, 1 ); // Sobel( rGray, rGrdX, CV_8U, 1, 0, 1 ); // lGrdX += 0.5; // rGrdX += 0.5; // // cl_uchar *clbuffer_lImgRGB[3], *clbuffer_rImgRGB[3]; // //Six 1-channel 2D buffers W*H // for (int i = 0; i < channels; i++) // { // clbuffer_lImgRGB[i] = (cl_uchar*)clEnqueueMapBuffer(*commandQueue, memoryObjects[i], CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, bufferSize_2D, 0, NULL, NULL, &errorNumber); // EnqueueMapBufferSuccess &= checkSuccess(errorNumber); // clbuffer_rImgRGB[i] = (cl_uchar*)clEnqueueMapBuffer(*commandQueue, memoryObjects[i+channels], CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, bufferSize_2D, 0, NULL, NULL, &errorNumber); // EnqueueMapBufferSuccess &= checkSuccess(errorNumber); // // memcpy(clbuffer_lImgRGB[i], lImgRGB[i].data, bufferSize_2D); // memcpy(clbuffer_rImgRGB[i], rImgRGB[i].data, bufferSize_2D); // } // } //Two 1-channel 2D buffers W*H cl_uchar *clbuffer_lGrdX = (cl_uchar*)clEnqueueMapBuffer(*commandQueue, memoryObjects[CVC_LGRDX], CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, bufferSize_2D, 0, NULL, NULL, &errorNumber); EnqueueMapBufferSuccess &= checkSuccess(errorNumber); cl_uchar *clbuffer_rGrdX = (cl_uchar*)clEnqueueMapBuffer(*commandQueue, memoryObjects[CVC_RGRDX], CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, bufferSize_2D, 0, NULL, NULL, &errorNumber); EnqueueMapBufferSuccess &= checkSuccess(errorNumber); if (!EnqueueMapBufferSuccess) { cleanUpOpenCL(NULL, NULL, program, NULL, NULL, 0); cerr << "Mapping memory objects failed " << __FILE__ << ":"<< __LINE__ << endl; } //printf("CVC_cl: Copying data to OpenCL memory space\n"); memcpy(clbuffer_lGrdX, lGrdX.data, bufferSize_2D); memcpy(clbuffer_rGrdX, rGrdX.data, bufferSize_2D); int arg_num = 0; /* Setup the kernel arguments. */ bool setKernelArgumentsSuccess = true; setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_LIMGR])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_LIMGG])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_LIMGB])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_RIMGR])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_RIMGG])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_RIMGB])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_LGRDX])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CVC_RGRDX])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_int), &height)); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_int), &width)); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CV_LCV])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, arg_num++, sizeof(cl_mem), &memoryObjects[CV_RCV])); if (!setKernelArgumentsSuccess) { cleanUpOpenCL(NULL, NULL, NULL, NULL, NULL, 0); cerr << "Failed setting OpenCL kernel arguments. " << __FILE__ << ":"<< __LINE__ << endl; } if(OCL_STATS) printf("CVC_cl: Running CVC Kernels\n"); /* Enqueue the kernel */ if (!checkSuccess(clEnqueueNDRangeKernel(*commandQueue, kernel, 3, NULL, globalWorksize, NULL, 0, NULL, &event))) { cleanUpOpenCL(NULL, NULL, NULL, NULL, NULL, 0); cerr << "Failed enqueuing the kernel. " << __FILE__ << ":"<< __LINE__ << endl; return 1; } /* Wait for completion */ if (!checkSuccess(clFinish(*commandQueue))) { cleanUpOpenCL(NULL, NULL, NULL, NULL, NULL, 0); cerr << "Failed waiting for kernel execution to finish. " << __FILE__ << ":"<< __LINE__ << endl; return 1; } /* Print the profiling information for the event. */ if(OCL_STATS) printProfilingInfo(event); /* Release the event object. */ if (!checkSuccess(clReleaseEvent(event))) { cleanUpOpenCL(*context, *commandQueue, program, NULL, NULL, 0); cerr << "Failed releasing the event object. " << __FILE__ << ":"<< __LINE__ << endl; return 1; } return 0; }
float sgemmMain(int rowa,int cola,int colb) { cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; const unsigned int numberOfMemoryObjects = 3; cl_mem memoryObjectsa = 0; cl_mem memoryObjectsb = 0; cl_mem memoryObjectsc = 0; cl_int errorNumber; cl_uint clrowa = rowa; cl_uint clcola = cola; cl_uint clcolb = colb; int err; err = createContext(&context); LOGD("create context"); err = createCommandQueue(context, &commandQueue, &device); err = createProgram(context, device, "/mnt/sdcard/kernel/sgemm.cl", &program); kernel = clCreateKernel(program, "sgemm", &errorNumber); LOGD("createKernel code %d",errorNumber); LOGD("start computing"); float alpha = 1; float beta = 0.1; /* Create the matrices. */ size_t matrixSizea = rowa * cola; size_t matrixSizeb = cola * colb; size_t matrixSizec = rowa * colb; /* As all the matrices have the same size, the buffer size is common. */ size_t bufferSizea = matrixSizea * sizeof(float); size_t bufferSizeb = matrixSizeb * sizeof(float); size_t bufferSizec = matrixSizec * sizeof(float); /* Create buffers for the matrices used in the kernel. */ int createMemoryObjectsSuccess = 0; memoryObjectsa = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizea, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; memoryObjectsb = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizeb, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; memoryObjectsc = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bufferSizec, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; LOGD("create memory err %d",createMemoryObjectsSuccess); int mapMemoryObjectsSuccess = 0; cl_float* matrixA = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsa, CL_TRUE, CL_MAP_WRITE, 0, bufferSizea, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; cl_float* matrixB = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsb, CL_TRUE, CL_MAP_WRITE, 0, bufferSizeb, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; cl_float* matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_WRITE, 0, bufferSizec, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; LOGD("map memory err %d",mapMemoryObjectsSuccess); sgemmInitialize(rowa,cola,colb, matrixA, matrixB, matrixC); LOGD("data initial finish"); int unmapMemoryObjectsSuccess = 0; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsa, matrixA, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsb, matrixB, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; LOGD("unmap memory err %d",unmapMemoryObjectsSuccess); int setKernelArgumentsSuccess = 0; errorNumber = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjectsa); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjectsb); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjectsc); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 3, sizeof(cl_uint), &clrowa); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 4, sizeof(cl_uint), &clcola); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 5, sizeof(cl_uint), &clcolb); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 6, sizeof(cl_float), &alpha); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 7, sizeof(cl_float), &beta); setKernelArgumentsSuccess &= errorNumber; LOGD("setKernel err %d",setKernelArgumentsSuccess); LOGD("start running kernel"); clock_t start_t,end_t; float cost_time; start_t = clock(); cl_event event = 0; size_t globalWorksize[2] = {rowa, colb}; errorNumber = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorksize, NULL, 0, NULL, &event); //LOGD("Enqueue err code %d",errorNumber); errorNumber = clFinish(commandQueue); end_t = clock(); cost_time = (float)(end_t-start_t)/CLOCKS_PER_SEC*1000; LOGD("Finish err code %d",errorNumber); float time; time = printProfilingInfo(event); LOGT("using CPU clock: %f ms",cost_time); LOGT("using GPU clock: %f ms",time); clReleaseEvent(event); matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_READ, 0, bufferSizec, 0, NULL, NULL, &errorNumber); clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL); LOGD("read out matrixC finish"); LOGD("matrixC value C(0,0): %f",matrixC[0]); cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjectsa, memoryObjectsb,memoryObjectsc,numberOfMemoryObjects); LOGD("RUNNING finsh"); return time; }