MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent) {

    /* register types so they can be used in Qt signals/slots */
    qRegisterMetaType<cv::Mat>("cv::Mat");
    qRegisterMetaType< std::vector<cv::Mat> >("std::vector<cv::Mat>");

    /* set up camera */
    camera = new Camera();
    bool camFound = camera->open(0);
    if (!camFound) {
        /* no camera found, forcing test mode */
        camera->setTestMode(true);
    } else {
        /* reset camera registers and start the LED ringlight */
        camera->reset();
        camera->printStatus();
    }
    camThread = new QThread;
    camera->moveToThread(camThread);

    /* create photometric stereo process */
    ps = new PhotometricStereo(camera->width, camera->height, camera->avgImageIntensity());

    /* set up ui */
    setWindowTitle("Realtime Photometric-Stereo");
    createInterface();
    statusBar()->setStyleSheet("font-size:12px;font-weight:bold;");

    /* connect camera with its thread */
    connect(camThread, SIGNAL(started()), camera, SLOT(start()));
    connect(camera, SIGNAL(stopped()), camThread, SLOT(quit()));
    connect(camera, SIGNAL(stopped()), camera, SLOT(deleteLater()));
    connect(camThread, SIGNAL(finished()), camThread, SLOT(deleteLater()));

    /* connect camera with camera widget and ps process */
    connect(camera, SIGNAL(newCamFrame(cv::Mat)), camWidget, SLOT(setImage(cv::Mat)), Qt::AutoConnection);
    /* invoke the ps setImage slot immediately when the signal is emitted, to preserve image order */
    connect(camera, SIGNAL(newCroppedFrame(cv::Mat)), ps, SLOT(setImage(cv::Mat)), Qt::DirectConnection);

    /* connect ps process with main window and model widget */
    connect(ps, SIGNAL(executionTime(QString)), this, SLOT(setStatusMessage(QString)), Qt::AutoConnection);
    connect(ps, SIGNAL(modelFinished(std::vector<cv::Mat>)), this, SLOT(onModelFinished(std::vector<cv::Mat>)), Qt::AutoConnection);

    /* start camera in a separate thread with high priority */
    camThread->start();
    camThread->setPriority(QThread::TimeCriticalPriority);
}
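/*
 * A minimal sketch of the signal/slot interface the connect() calls above
 * assume. Only the signal/slot names and the cv::Mat parameter type appear
 * in the original; the class declaration around them is an assumption added
 * for illustration.
 */
class Camera : public QObject {
    Q_OBJECT
public slots:
    void start();                         /* grab loop, runs inside camThread */
signals:
    void stopped();                       /* quits camThread and triggers deleteLater() */
    void newCamFrame(cv::Mat frame);      /* full frame, shown by the camera widget */
    void newCroppedFrame(cv::Mat frame);  /* cropped frame, consumed by PhotometricStereo::setImage */
};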
void menu() {
    /* seed the random number generator */
    srand(time(NULL));

    time_t time_start, time_end;   /* time() returns time_t, not clock_t */
    int menu, status = 0, position;
    int noValue, n_elem, i, j;
    int pai, qt_filhos, filho;
    pTREE pRoot;

    /* 2.) The team must build a simple data-input interface (tree nodes and
       edges; traversal type) as well as a report with the resulting output.
       The tree may be arbitrary. */
    do {
        printf("\n[0] Create new tree");
        printf("\n[1] Pre-order traversal");
        printf("\n[2] In-order traversal");
        printf("\n[3] Post-order traversal");
        printf("\n");
        scanf("%d", &menu);

        switch (menu) {
        case 0:
            printf("\n How many entries:");
            printf("\n");
            scanf("%d", &n_elem);

            /* allocate the root; children are attached via the input loop below
               (assuming TREE stores its child count, needed for traversal/DFS) */
            pRoot = malloc(sizeof(TREE));
            pRoot->ppFilhos = NULL;
            pRoot->qt_filhos = 0;

            printf("\n Root value:");
            printf("\n");
            scanf("%d", &noValue);
            pRoot->data = noValue;

            for (i = 0; i < n_elem; i++) {
                printf("Enter [parent element] [number of children] [each child, up to number of children]; [-1 or a nonexistent element] to stop\n");
                scanf("%d", &pai);

                /* find the parent in the tree via DFS, stop if absent;
                   findNode() is a hypothetical helper, sketched below */
                pTREE pPai = findNode(pRoot, pai);
                if (pPai == NULL)
                    break;

                scanf("%d", &qt_filhos);
                /* allocate the parent's child-pointer array and attach each child */
                pPai->ppFilhos = malloc(sizeof(pTREE) * qt_filhos);
                pPai->qt_filhos = qt_filhos;
                for (j = 0; j < qt_filhos; j++) {
                    pTREE pFilho = malloc(sizeof(TREE));
                    scanf("%d", &filho);
                    pFilho->data = filho;
                    pFilho->ppFilhos = NULL;   /* a leaf until it gets children */
                    pFilho->qt_filhos = 0;
                    pPai->ppFilhos[j] = pFilho;
                }
            }
            status = 1;
            break;
        case 1:
            printf("What is the block size?\n");
            /* scanf("%d", &blockSize); */
            time_start = time(NULL);
            /* readRandomEntriesBlock(blockSize); */
            time_end = time(NULL);
            executionTime(time_start, time_end);
            break;
        case 2:
            time_start = time(NULL);
            /* generateOneEntry(size, blockSize); */
            time_end = time(NULL);
            executionTime(time_start, time_end);
            break;
        case 3:
            printf("What position?\n");
            scanf("%d", &position);
            time_start = time(NULL);
            /* readEntryPosition(position); */
            time_end = time(NULL);
            executionTime(time_start, time_end);
            break;
        case 4:
            /* status = 0; */
            /* position = getInsertPosition(); */
            /* quantity = getInsertQuantity(); */
            /* printEntries(getMultipleEntries(position, quantity), quantity); */
            break;
        default:
            status = 1;
            printf("\nPlease, select a valid option!!!");
            break;
        }
    } while (status == 1);
}
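/*
 * A hypothetical depth-first search for the "find the parent in the tree"
 * step referenced above. It assumes TREE carries a child count (qt_filhos)
 * next to the ppFilhos child-pointer array; the original does not show the
 * struct layout, so both field names are assumptions.
 */
pTREE findNode(pTREE pNode, int value) {
    if (pNode == NULL)
        return NULL;
    if (pNode->data == value)
        return pNode;
    for (int i = 0; i < pNode->qt_filhos; i++) {
        pTREE pFound = findNode(pNode->ppFilhos[i], value);
        if (pFound != NULL)
            return pFound;
    }
    return NULL;
}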
void matrixMulGPU(cl_uint ciDeviceCount, cl_mem h_A, float* h_B_data, unsigned int mem_size_B, float* h_C)
{
    cl_mem d_A[MAX_GPU_COUNT];
    cl_mem d_C[MAX_GPU_COUNT];
    cl_mem d_B[MAX_GPU_COUNT];
    cl_event GPUDone[MAX_GPU_COUNT];
    cl_event GPUExecution[MAX_GPU_COUNT];

    // Start the computation on each available GPU: create buffers per GPU;
    // each GPU computes sizePerGPU rows of the result
    int sizePerGPU = uiHA / ciDeviceCount;
    int workOffset[MAX_GPU_COUNT];
    int workSize[MAX_GPU_COUNT];

    workOffset[0] = 0;
    for (unsigned int i = 0; i < ciDeviceCount; ++i)
    {
        // Input buffer; the last device takes any remainder rows
        workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (uiHA - workOffset[i]);
        d_A[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float) * uiWA, NULL, NULL);

        // Copy only the assigned rows from host to device
        clEnqueueCopyBuffer(commandQueue[i], h_A, d_A[i], workOffset[i] * sizeof(float) * uiWA, 0,
                            workSize[i] * sizeof(float) * uiWA, 0, NULL, NULL);

        // Create an OpenCL buffer on the device that will be initialized from host memory on first use
        d_B[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B_data, NULL);

        // Output buffer
        d_C[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, workSize[i] * uiWC * sizeof(float), NULL, NULL);

        // Set the kernel argument values
        clSetKernelArg(multiplicationKernel[i], 0, sizeof(cl_mem), (void *) &d_C[i]);
        clSetKernelArg(multiplicationKernel[i], 1, sizeof(cl_mem), (void *) &d_A[i]);
        clSetKernelArg(multiplicationKernel[i], 2, sizeof(cl_mem), (void *) &d_B[i]);
        clSetKernelArg(multiplicationKernel[i], 3, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, 0);
        clSetKernelArg(multiplicationKernel[i], 4, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, 0);
        clSetKernelArg(multiplicationKernel[i], 5, sizeof(cl_int), (void *) &uiWA);
        clSetKernelArg(multiplicationKernel[i], 6, sizeof(cl_int), (void *) &uiWB);

        if (i + 1 < ciDeviceCount)
            workOffset[i + 1] = workOffset[i] + workSize[i];
    }

    // Execute the multiplication on all GPUs in parallel
    size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
    size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE, uiWC), shrRoundUp(BLOCK_SIZE, workSize[0])};

    // Launch kernels on devices
#ifdef GPU_PROFILING
    int nIter = 30;
    for (int j = -1; j < nIter; j++)
    {
        // Sync all queues to host and start the timer on the first timed pass
        // (j == -1 is an untimed warmup iteration)
        if (j == 0)
        {
            for (unsigned int i = 0; i < ciDeviceCount; i++)
            {
                clFinish(commandQueue[i]);
            }
            shrDeltaT(0);
        }
#endif
        for (unsigned int i = 0; i < ciDeviceCount; i++)
        {
            // Multiplication - non-blocking execution: launch and push to device(s)
            globalWorkSize[1] = shrRoundUp(BLOCK_SIZE, workSize[i]);
            clEnqueueNDRangeKernel(commandQueue[i], multiplicationKernel[i], 2, 0, globalWorkSize, localWorkSize,
                                   0, NULL, &GPUExecution[i]);
            clFlush(commandQueue[i]);
        }
#ifdef GPU_PROFILING
    }
#endif

    // Sync all queues to host
    for (unsigned int i = 0; i < ciDeviceCount; i++)
    {
        clFinish(commandQueue[i]);
    }

#ifdef GPU_PROFILING
    // Stop and log the timer
    double dSeconds = shrDeltaT(0) / (double)nIter;
    double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;
    double gflops = 1.0e-9 * dNumOps / dSeconds;
    shrLogEx(LOGBOTH | MASTER, 0, "oclMatrixMul, Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Workgroup = %u\n",
             gflops, dSeconds, dNumOps, ciDeviceCount, localWorkSize[0] * localWorkSize[1]);

    // Print kernel timing per GPU
    shrLog("\n");
    for (unsigned int i = 0; i < ciDeviceCount; i++)
    {
        shrLog("  Kernel execution time on GPU %d \t: %.5f s\n", i, executionTime(GPUExecution[i]));
    }
    shrLog("\n");
#endif

    for (unsigned int i = 0; i < ciDeviceCount; i++)
    {
        // Non-blocking copy of the result from device to host
        clEnqueueReadBuffer(commandQueue[i], d_C[i], CL_FALSE, 0, uiWC * sizeof(float) * workSize[i],
                            h_C + workOffset[i] * uiWC, 0, NULL, &GPUDone[i]);
    }

    // CPU sync with GPU
    clWaitForEvents(ciDeviceCount, GPUDone);

    // Release memory and event objects
    for (unsigned int i = 0; i < ciDeviceCount; i++)
    {
        clReleaseMemObject(d_A[i]);
        clReleaseMemObject(d_C[i]);
        clReleaseMemObject(d_B[i]);
        clReleaseEvent(GPUExecution[i]);
        clReleaseEvent(GPUDone[i]);
    }
}
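/*
 * A minimal sketch of the executionTime() helper used in the profiling
 * blocks above and below. The original does not show its definition; this
 * version assumes it reads the event's start/end timestamps (profiling is
 * enabled on the queues via CL_QUEUE_PROFILING_ENABLE) and returns elapsed
 * seconds, matching the "%.5f s" format strings.
 */
double executionTime(cl_event event)
{
    cl_ulong start, end;
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
    return (double)(end - start) * 1.0e-9;   /* timestamps are in nanoseconds */
}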
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    shrQAStart(argc, argv);

    // Start logs
    shrSetLogFileName("oclSimpleMultiGPU.txt");
    shrLog("%s Starting, Array = %u float values...\n\n", argv[0], DATA_N);

    // OpenCL
    cl_platform_id cpPlatform;
    cl_uint ciDeviceCount;
    cl_device_id* cdDevices;
    cl_context cxGPUContext;
    cl_device_id cdDevice;                       // GPU device
    int deviceNr[MAX_GPU_COUNT];
    cl_command_queue commandQueue[MAX_GPU_COUNT];
    cl_mem d_Data[MAX_GPU_COUNT];
    cl_mem d_Result[MAX_GPU_COUNT];
    cl_program cpProgram;
    cl_kernel reduceKernel[MAX_GPU_COUNT];
    cl_event GPUDone[MAX_GPU_COUNT];
    cl_event GPUExecution[MAX_GPU_COUNT];
    size_t programLength;
    cl_int ciErrNum;
    char cDeviceName[256];
    cl_mem h_DataBuffer;

    // Vars for reduction results
    float h_SumGPU[MAX_GPU_COUNT * ACCUM_N];
    float *h_Data;
    double sumGPU;
    double sumCPU, dRelError;

    // Allocate and initialize the host buffer with randomly generated input data
    h_Data = (float *)malloc(DATA_N * sizeof(float));
    shrFillArray(h_Data, DATA_N);

    // Start timer & logs
    shrLog("Setting up OpenCL on the Host...\n\n");
    shrDeltaT(1);

    // Annotate profiling state
#ifdef GPU_PROFILING
    shrLog("OpenCL Profiling is enabled...\n\n");
#endif

    // Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clGetPlatformID...\n");

    // Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id));
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clGetDeviceIDs...\n");

    // Create the context
    cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clCreateContext...\n");

    // Set up command queue(s) for the GPUs specified on the command line, or for all GPUs
    if (shrCheckCmdLineFlag(argc, (const char **)argv, "device"))
    {
        // User-specified GPUs
        int ciMaxDeviceID = ciDeviceCount - 1;
        ciDeviceCount = 0;
        char* deviceList;
        char* deviceStr;
        char* next_token;
        shrGetCmdLineArgumentstr(argc, (const char **)argv, "device", &deviceList);

#ifdef WIN32
        deviceStr = strtok_s(deviceList, " ,.-", &next_token);
#else
        deviceStr = strtok(deviceList, " ,.-");
#endif

        // Create command queues for all requested GPUs
        while (deviceStr != NULL)
        {
            // Get & log the device index # and name
            deviceNr[ciDeviceCount] = atoi(deviceStr);
            if (deviceNr[ciDeviceCount] > ciMaxDeviceID)
            {
                shrLog(" Invalid user specified device ID: %d\n", deviceNr[ciDeviceCount]);
                return 1;
            }
            cdDevice = oclGetDev(cxGPUContext, deviceNr[ciDeviceCount]);
            ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog(" Device %i: %s\n\n", deviceNr[ciDeviceCount], cDeviceName);

            // Create a command queue
            commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog("clCreateCommandQueue\n");

            ++ciDeviceCount;

#ifdef WIN32
            deviceStr = strtok_s(NULL, " ,.-", &next_token);
#else
            deviceStr = strtok(NULL, " ,.-");
#endif
        }

        free(deviceList);
    }
    else
    {
        // No devices specified: compute on all available GPUs
        size_t nDeviceBytes;
        ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
        oclCheckError(ciErrNum, CL_SUCCESS);
        ciDeviceCount = (cl_uint)nDeviceBytes / sizeof(cl_device_id);

        for (unsigned int i = 0; i < ciDeviceCount; ++i)
        {
            // Get & log the device index # and name
            deviceNr[i] = i;
            cdDevice = oclGetDev(cxGPUContext, i);
            ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog(" Device %i: %s\n", i, cDeviceName);

            // Create a command queue
            commandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog("clCreateCommandQueue\n\n");
        }
    }

    // Load the OpenCL source code from the .cl file
    const char* source_path = shrFindFilePath("simpleMultiGPU.cl", argv[0]);
    char *source = oclLoadProgSource(source_path, "", &programLength);
    oclCheckError(source != NULL, shrTRUE);
    shrLog("oclLoadProgSource\n");

    // Create the program for all GPUs in the context
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &programLength, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clCreateProgramWithSource\n");

    // Build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // Write out the standard error, build log and PTX, then clean up and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSimpleMultiGPU.ptx");
        oclCheckError(ciErrNum, CL_SUCCESS);
    }
    shrLog("clBuildProgram\n");

    // Create a host buffer with page-locked memory
    h_DataBuffer = clCreateBuffer(cxGPUContext, CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR,
                                  DATA_N * sizeof(float), h_Data, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clCreateBuffer (Page-locked Host)\n\n");

    // Create buffers for each GPU, with the data divided evenly among the GPUs
    int sizePerGPU = DATA_N / ciDeviceCount;
    int workOffset[MAX_GPU_COUNT];
    int workSize[MAX_GPU_COUNT];
    workOffset[0] = 0;
    for (unsigned int i = 0; i < ciDeviceCount; ++i)
    {
        // The last device takes any remainder
        workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (DATA_N - workOffset[i]);

        // Input buffer
        d_Data[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateBuffer (Input)\t\tDev %i\n", i);

        // Copy data from host to device
        ciErrNum = clEnqueueCopyBuffer(commandQueue[i], h_DataBuffer, d_Data[i], workOffset[i] * sizeof(float),
                                       0, workSize[i] * sizeof(float), 0, NULL, NULL);
        shrLog("clEnqueueCopyBuffer (Input)\tDev %i\n", i);

        // Output buffer
        d_Result[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, ACCUM_N * sizeof(float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateBuffer (Output)\t\tDev %i\n", i);

        // Create kernel
        reduceKernel[i] = clCreateKernel(cpProgram, "reduce", &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateKernel\t\t\tDev %i\n", i);

        // Set the arg values and check for errors
        ciErrNum |= clSetKernelArg(reduceKernel[i], 0, sizeof(cl_mem), &d_Result[i]);
        ciErrNum |= clSetKernelArg(reduceKernel[i], 1, sizeof(cl_mem), &d_Data[i]);
        ciErrNum |= clSetKernelArg(reduceKernel[i], 2, sizeof(int), &workSize[i]);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clSetKernelArg\t\t\tDev %i\n\n", i);

        // Guard against writing one past the end of workOffset on the last device
        if (i + 1 < ciDeviceCount)
            workOffset[i + 1] = workOffset[i] + workSize[i];
    }

    // Set the # of work items per work group, and the total # in the 1-dimensional range
    size_t localWorkSize[] = {THREAD_N};
    size_t globalWorkSize[] = {ACCUM_N};

    // Start the timer and launch the reduction kernel on each GPU, with the data split between them
    shrLog("Launching Kernels on GPU(s)...\n\n");
    for (unsigned int i = 0; i < ciDeviceCount; i++)
    {
        ciErrNum = clEnqueueNDRangeKernel(commandQueue[i], reduceKernel[i], 1, 0, globalWorkSize, localWorkSize,
                                          0, NULL, &GPUExecution[i]);
        oclCheckError(ciErrNum, CL_SUCCESS);
    }

    // Copy the result from device to host for each device
    for (unsigned int i = 0; i < ciDeviceCount; i++)
    {
        ciErrNum = clEnqueueReadBuffer(commandQueue[i], d_Result[i], CL_FALSE, 0, ACCUM_N * sizeof(float),
                                       h_SumGPU + i * ACCUM_N, 0, NULL, &GPUDone[i]);
        oclCheckError(ciErrNum, CL_SUCCESS);
    }

    // Synchronize with the GPUs and do the accumulated error check
    clWaitForEvents(ciDeviceCount, GPUDone);
    shrLog("clWaitForEvents complete...\n\n");

    // Aggregate the results for all GPUs and stop/log the processing time
    sumGPU = 0;
    for (unsigned int i = 0; i < ciDeviceCount * ACCUM_N; i++)
    {
        sumGPU += h_SumGPU[i];
    }

    // Print the execution times for each GPU
#ifdef GPU_PROFILING
    shrLog("Profiling Information for GPU Processing:\n\n");
    for (unsigned int i = 0; i < ciDeviceCount; i++)
    {
        cdDevice = oclGetDev(cxGPUContext, deviceNr[i]);
        clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL);
        shrLog("Device %i : %s\n", deviceNr[i], cDeviceName);
        shrLog("  Reduce Kernel     : %.5f s\n", executionTime(GPUExecution[i]));
        shrLog("  Copy Device->Host : %.5f s\n\n\n", executionTime(GPUDone[i]));
    }
#endif

    // Run the computation on the host CPU and log the processing time
    shrLog("Launching Host/CPU C++ Computation...\n\n");
    sumCPU = 0;
    for (unsigned int i = 0; i < DATA_N; i++)
    {
        sumCPU += h_Data[i];
    }

    // Check the GPU result against the CPU result
    dRelError = 100.0 * fabs(sumCPU - sumGPU) / fabs(sumCPU);
    shrLog("Comparing against Host/C++ computation...\n");
    shrLog(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU);
    shrLog(" Relative Error (100.0 * Error / Golden) = %f \n\n", dRelError);

    // Cleanup
    free(source);
    free(h_Data);
    for (unsigned int i = 0; i < ciDeviceCount; ++i)
    {
        clReleaseKernel(reduceKernel[i]);
        clReleaseCommandQueue(commandQueue[i]);
    }
    clReleaseProgram(cpProgram);
    clReleaseContext(cxGPUContext);

    // Finish
    shrQAFinishExit(argc, (const char **)argv, (dRelError < 1e-4) ? QA_PASSED : QA_FAILED);
}
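/*
 * simpleMultiGPU.cl is not shown in the original. A plausible sketch of its
 * "reduce" kernel, inferred from the host code above: ACCUM_N work items per
 * device each accumulate a strided partial sum over that device's workSize
 * elements, and the host then adds up the ciDeviceCount * ACCUM_N partials.
 * The body below is an assumption, not the shipped kernel.
 */
__kernel void reduce(__global float *d_Result, __global float *d_Data, int N)
{
    const int tid     = get_global_id(0);
    const int threadN = get_global_size(0);

    float sum = 0.0f;
    for (int pos = tid; pos < N; pos += threadN)
        sum += d_Data[pos];

    d_Result[tid] = sum;
}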
int main(int argc, char *argv[])
{
    // Check command-line parameters (program name + 3 arguments; the
    // original tested argc < 3 but reads argv[3] below)
    if (argc < 4) {
        fprintf(stderr, "Usage: %s [kernel] [length of vector] [dim]\n", argv[0]);
        exit(1);
    }

    cl_int errorCode;
    cl_device_type deviceType = CL_DEVICE_TYPE_CPU;
    cl_device_id *devices = NULL;
    cl_context context = NULL;
    cl_command_queue cmdQueue = NULL;
    cl_program program = NULL;

    char *kernelfile = argv[1];
    int length = atoi(argv[2]);
    int dim = atoi(argv[3]);

    assert(initialization(deviceType, devices, &context, &cmdQueue, &program, kernelfile));

    // Host vectors: compute Z = alpha * X + Y
    float *X = (float*) malloc(sizeof(float) * length);
    float *Y = (float*) malloc(sizeof(float) * length);
    float *Z = (float*) malloc(sizeof(float) * length);
    for (int i = 0; i < length; i++) {
        X[i] = (float)i + 0.1;
        Y[i] = (float)i + 0.2;
        Z[i] = 0.0;
    }

    cl_mem X_mem, Y_mem, Z_mem;
    ALLOCATE_GPU_READ(X_mem, X, sizeof(float) * length);
    ALLOCATE_GPU_READ(Y_mem, Y, sizeof(float) * length);
    ALLOCATE_GPU_READ_WRITE_INIT(Z_mem, Z, sizeof(float) * length);

    // Each work item handles dim elements
    size_t globalSize[1] = {length / dim};
    size_t localSize[1] = {1};
    float alpha = 0.2;

    cl_kernel kernel = clCreateKernel(program, "saxpy_naive", &errorCode); CHECKERROR;
    errorCode = clSetKernelArg(kernel, 0, sizeof(cl_mem), &X_mem); CHECKERROR;
    errorCode = clSetKernelArg(kernel, 1, sizeof(cl_mem), &Y_mem); CHECKERROR;
    errorCode = clSetKernelArg(kernel, 2, sizeof(cl_mem), &Z_mem); CHECKERROR;
    errorCode = clSetKernelArg(kernel, 3, sizeof(cl_float), &alpha); CHECKERROR;
    errorCode = clSetKernelArg(kernel, 4, sizeof(cl_int), &dim); CHECKERROR;

    // Untimed first launch (warm-up); the timed launch follows
    errorCode = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalSize, localSize, 0, NULL, NULL); CHECKERROR;

    printf("Start to Run ...\n");
    cl_event runEvent;
    errorCode = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalSize, localSize, 0, NULL, &runEvent); CHECKERROR;
    errorCode = clFinish(cmdQueue);
    printf("Execution Time: %.2fns\n", executionTime(runEvent) / length * 1e9);   // ns per element

    printf("Start to Readback ...\n");
    errorCode = clEnqueueReadBuffer(cmdQueue, Z_mem, CL_TRUE, 0, sizeof(float) * length, Z, 0, NULL, NULL); CHECKERROR;

    printf("Checking Correctness ...\n");
    for (int i = 0; i < length; i++) {
        float res = X[i] * alpha + Y[i];
        float ans = Z[i];
        if (res - ans > 1E-4 || res - ans < -1E-4) {
            printf("%.10f %.10f %.10f\n", res, ans, res - ans);
            fprintf(stderr, "ERROR!");
            exit(1);
        }
    }
    printf("OK\n");
    return 0;
}
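/*
 * ALLOCATE_GPU_READ, ALLOCATE_GPU_READ_WRITE_INIT and CHECKERROR are used
 * above but not defined in the original. Hypothetical definitions consistent
 * with that usage (buffers created in `context`, error code left in
 * `errorCode`):
 */
#define CHECKERROR \
    if (errorCode != CL_SUCCESS) { \
        fprintf(stderr, "OpenCL error %d at line %d\n", errorCode, __LINE__); \
        exit(1); \
    }

#define ALLOCATE_GPU_READ(mem, ptr, size) \
    mem = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, (size), (ptr), &errorCode); \
    CHECKERROR

#define ALLOCATE_GPU_READ_WRITE_INIT(mem, ptr, size) \
    mem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, (size), (ptr), &errorCode); \
    CHECKERROR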
void PhotometricStereo::execute() {

    /* measure ps performance */
    long start = getMilliSecs();

    /* create OpenCL buffers */
    size_t imgSize3 = sizeof(float) * (height * width * 3);
    size_t gradSize = sizeof(float) * (height * width);
    size_t sSize = sizeof(float) * (lightSrcsInv.rows * lightSrcsInv.cols * lightSrcsInv.channels());

    cl::ImageFormat imgFormat = cl::ImageFormat(CL_INTENSITY, CL_UNORM_INT8);
    cl_img1 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img2 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img3 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img4 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img5 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img6 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img7 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img8 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_Sinv = cl::Buffer(context, CL_MEM_READ_ONLY, sSize, NULL, &error);
    cl_Pgrads = cl::Buffer(context, CL_MEM_WRITE_ONLY, gradSize, NULL, &error);
    cl_Qgrads = cl::Buffer(context, CL_MEM_WRITE_ONLY, gradSize, NULL, &error);
    cl_N = cl::Buffer(context, CL_MEM_WRITE_ONLY, imgSize3, NULL, &error);

    /* push data to the device (the original comment said "to CPU") */
    cv::Mat Normals(height, width, CV_32FC3, cv::Scalar::all(0));
    cv::Mat Pgrads(height, width, CV_32F, cv::Scalar::all(0));
    cv::Mat Qgrads(height, width, CV_32F, cv::Scalar::all(0));

    cl::size_t<3> origin;
    origin[0] = 0; origin[1] = 0; origin[2] = 0;
    cl::size_t<3> region;
    region[0] = width; region[1] = height; region[2] = 1;

    mutex.lock();
    queue.enqueueWriteImage(cl_img1, CL_TRUE, origin, region, 0, 0, psImages.at(0).data);
    queue.enqueueWriteImage(cl_img2, CL_TRUE, origin, region, 0, 0, psImages.at(1).data);
    queue.enqueueWriteImage(cl_img3, CL_TRUE, origin, region, 0, 0, psImages.at(2).data);
    queue.enqueueWriteImage(cl_img4, CL_TRUE, origin, region, 0, 0, psImages.at(3).data);
    queue.enqueueWriteImage(cl_img5, CL_TRUE, origin, region, 0, 0, psImages.at(4).data);
    queue.enqueueWriteImage(cl_img6, CL_TRUE, origin, region, 0, 0, psImages.at(5).data);
    queue.enqueueWriteImage(cl_img7, CL_TRUE, origin, region, 0, 0, psImages.at(6).data);
    queue.enqueueWriteImage(cl_img8, CL_TRUE, origin, region, 0, 0, psImages.at(7).data);
    mutex.unlock();

    queue.enqueueWriteBuffer(cl_Sinv, CL_TRUE, 0, sSize, lightSrcsInv.data, NULL, &event);
    queue.enqueueWriteBuffer(cl_Pgrads, CL_TRUE, 0, gradSize, Pgrads.data, NULL, &event);
    queue.enqueueWriteBuffer(cl_Qgrads, CL_TRUE, 0, gradSize, Qgrads.data, NULL, &event);
    queue.enqueueWriteBuffer(cl_N, CL_TRUE, 0, imgSize3, Normals.data, NULL, &event);

    /* set kernel arguments */
    calcNormKernel.setArg(0, cl_img1);          /* images 1-8 */
    calcNormKernel.setArg(1, cl_img2);
    calcNormKernel.setArg(2, cl_img3);
    calcNormKernel.setArg(3, cl_img4);
    calcNormKernel.setArg(4, cl_img5);
    calcNormKernel.setArg(5, cl_img6);
    calcNormKernel.setArg(6, cl_img7);
    calcNormKernel.setArg(7, cl_img8);
    calcNormKernel.setArg(8, width);            /* required for determining array dimensions */
    calcNormKernel.setArg(9, height);
    calcNormKernel.setArg(10, cl_Sinv);         /* inverse of the light matrix */
    calcNormKernel.setArg(11, cl_Pgrads);       /* P gradients */
    calcNormKernel.setArg(12, cl_Qgrads);       /* Q gradients */
    calcNormKernel.setArg(13, cl_N);            /* normal for each point */
    calcNormKernel.setArg(14, maxpq);           /* max depth gradient as in [Wei2001] */
    calcNormKernel.setArg(15, minIntensity);    /* exaggerate slope as in [Malzbender2006] */

    /* wait for the command queue to finish before continuing */
    queue.finish();

    /* execute kernel */
    queue.enqueueNDRangeKernel(calcNormKernel, cl::NullRange, cl::NDRange(height, width), cl::NullRange, NULL, &event);
    queue.finish();

    /* read back from the device */
    queue.enqueueReadBuffer(cl_Pgrads, CL_TRUE, 0, gradSize, Pgrads.data);
    queue.enqueueReadBuffer(cl_Qgrads, CL_TRUE, 0, gradSize, Qgrads.data);
    queue.enqueueReadBuffer(cl_N, CL_TRUE, 0, imgSize3, Normals.data);

    /* integrate the gradients into global heights */
    cv::Mat Zcoords = getGlobalHeights(Pgrads, Qgrads);

    /* push the normals to the device again */
    cl_N = cl::Buffer(context, CL_MEM_READ_WRITE, imgSize3, NULL, &error);
    queue.enqueueWriteBuffer(cl_N, CL_TRUE, 0, imgSize3, Normals.data);

    /* unsharp masking as in [Malzbender2006] */
    updateNormKernel.setArg(0, cl_N);
    updateNormKernel.setArg(1, width);
    updateNormKernel.setArg(2, height);
    updateNormKernel.setArg(3, cl_Pgrads);
    updateNormKernel.setArg(4, cl_Qgrads);
    updateNormKernel.setArg(5, unsharpScaleFactor);

    /* execute the kernel updating the normals */
    queue.enqueueNDRangeKernel(updateNormKernel, cl::NullRange, cl::NDRange(height, width), cl::NullRange, NULL, &event);
    queue.finish();

    /* read back from the device */
    queue.enqueueReadBuffer(cl_Pgrads, CL_TRUE, 0, gradSize, Pgrads.data);
    queue.enqueueReadBuffer(cl_Qgrads, CL_TRUE, 0, gradSize, Qgrads.data);
    queue.enqueueReadBuffer(cl_N, CL_TRUE, 0, imgSize3, Normals.data);

    /* integrate the updated gradients a second time */
    Zcoords = getGlobalHeights(Pgrads, Qgrads);

    /* store the 3d data and normals tensor-like */
    std::vector<cv::Mat> matVec;
    matVec.push_back(XCoords);
    matVec.push_back(YCoords);
    matVec.push_back(Zcoords);
    matVec.push_back(Normals);

    emit executionTime("Elapsed time: " + QString::number(getMilliSecs() - start) + " ms.");
    emit modelFinished(matVec);
}
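/*
 * getMilliSecs() is used for the timing above but not defined in the
 * original; a minimal sketch assuming a POSIX wall clock via gettimeofday().
 */
#include <sys/time.h>

long getMilliSecs() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec * 1000L + tv.tv_usec / 1000L;
}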