MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent) {

    /* register several types in order to use it for qt signals/slots */
    qRegisterMetaType<cv::Mat>("cv::Mat");
    qRegisterMetaType< std::vector<cv::Mat> >("std::vector<cv::Mat>");

    /* setup camera */
    camera = new Camera();
    bool camFound = camera->open(0);
    if (!camFound) {
        /* no camera, forcing test mode */
        camera->setTestMode(true);
    } else {
        /* reset camera registers and start led ringlight */
        camera->reset();
        camera->printStatus();
    }
    
    camThread = new QThread;
    camera->moveToThread(camThread);
        
    /* creating photometric stereo process */
    ps = new PhotometricStereo(camera->width, camera->height, camera->avgImageIntensity());

    /* setup ui */
    setWindowTitle("Realtime Photometric-Stereo");
    createInterface();
    statusBar()->setStyleSheet("font-size:12px;font-weight:bold;");

    /* connecting camera with attached thread */
    connect(camThread, SIGNAL(started()), camera, SLOT(start()));
    connect(camera, SIGNAL(stopped()), camThread, SLOT(quit()));
    connect(camera, SIGNAL(stopped()), camera, SLOT(deleteLater()));
    connect(camThread, SIGNAL(finished()), camThread, SLOT(deleteLater()));
    
    /* connecting camera with camerawidget and ps process */
    connect(camera, SIGNAL(newCamFrame(cv::Mat)), camWidget, SLOT(setImage(cv::Mat)), Qt::AutoConnection);
    /* invoking ps setImage slot immediately, when the signal is emitted to ensure image order */
    connect(camera, SIGNAL(newCroppedFrame(cv::Mat)), ps, SLOT(setImage(cv::Mat)), Qt::DirectConnection);
    
    /* connecting ps process with mainwindow and modelwidget */
    connect(ps, SIGNAL(executionTime(QString)), this, SLOT(setStatusMessage(QString)), Qt::AutoConnection);
    connect(ps, SIGNAL(modelFinished(std::vector<cv::Mat>)), this, SLOT(onModelFinished(std::vector<cv::Mat>)), Qt::AutoConnection);
    
    /* start camera in separate thread with high priority */
    camThread->start();
    camThread->setPriority(QThread::TimeCriticalPriority);
}
void menu() {
    //to use random values
    srand(time(NULL));
    clock_t time_start, time_end;

    int menu, status = 0, position;
    int noValue, n_elem, i, j;
    int pai, qt_filhos, filho;
    pTREE pRoot;



    pTREE pNewTree;

    /*
    2.) A equipe deverá montar uma interface simples de entrada de dados ( nós e arestas da árvore; tipo de percurso) bem como um relatório com as saídas dos resultados. A árvore deverá ser qualquer
     */

    do {

        printf("\n[0] Criar nova arvore");
        printf("\n[1] Percurso em Pre-Ordem");
        printf("\n[2] Percurso Em-Ordem ");
        printf("\n[3] Percurso em Pos-Ordem");
        printf("\n");
        scanf("%d", &menu);

        switch (menu) {
            case 0:

                printf("\n Quantas entradas:");
                printf("\n");
                scanf("%d", &n_elem);

                pRoot = malloc(sizeof (TREE));
                pRoot->ppFilhos = malloc(sizeof (pTREE));
                *(pRoot)->ppFilhos = malloc(sizeof (pTREE) * n_elem);

                printf("\n Qual a raiz:");
                printf("\n");

                scanf("%d", &noValue);

                pRoot->data = noValue;

                for (i = 0; i < n_elem; i++) {
                    printf("Digite o [elemto pai] [quantidade filhos] [cada filho ate [quantidade filhos]]  [-1 ou elem inexistente] Para parar\n");

                    scanf("%d", &pai);

                    //achar pai na arvore, se nao exit
                    //dfs
                    ppTREE ppPai;
                    ppPai = malloc(sizeof (pTREE));
                    //apontar para o pai que achou

                    scanf("%d", &qt_filhos);

                    *ppPai = malloc(sizeof (TREE) * qt_filhos);
                    pTREE pFilho;
                    for (j = 0; j < qt_filhos; j++) {
                        pFilho = malloc(sizeof (TREE));
                        scanf("%d", &filho);
                        pFilho->data = filho;

                        memcpy(ppPai[j], pFilho, sizeof (TREE));
                    }

                }
                status = 1;
                break;

            case 1:
                printf("What is the block size?\n");
                //scanf("%d",&blockSize);

                time_start = time(NULL);
                //readRandomEntriesBlock(blockSize);
                time_end = time(NULL);

                executionTime(time_start, time_end);
                break;

            case 2:
                time_start = time(NULL);
                //generateOneEntry(size,blockSize);
                time_end = time(NULL);

                executionTime(time_start, time_end);
                break;

            case 3:
                printf("What position?\n");
                scanf("%d", &position);

                time_start = time(NULL);
                //readEntryPosition(position);
                time_end = time(NULL);

                executionTime(time_start, time_end);
                break;



            case 4:
                /* status = 0; */
                /* position = getInsertPosition(); */
                /* quantity = getInsertQuantity(); */
                /* printEntries(getMultipleEntries(position, quantity), quantity); */
                break;

            default:
                status = 1;
                printf("\nPlease, select a valid option!!!");
                break;
        }

    } while (status == 1);
}
Exemple #3
0
void matrixMulGPU(cl_uint ciDeviceCount, cl_mem h_A, float* h_B_data, unsigned int mem_size_B, float* h_C )
{
    cl_mem d_A[MAX_GPU_COUNT];
    cl_mem d_C[MAX_GPU_COUNT];
    cl_mem d_B[MAX_GPU_COUNT];

    cl_event GPUDone[MAX_GPU_COUNT];
    cl_event GPUExecution[MAX_GPU_COUNT];

    // Start the computation on each available GPU
    
    // Create buffers for each GPU
    // Each GPU will compute sizePerGPU rows of the result
    int sizePerGPU = uiHA / ciDeviceCount;

    int workOffset[MAX_GPU_COUNT];
    int workSize[MAX_GPU_COUNT];

    workOffset[0] = 0;
    for(unsigned int i=0; i < ciDeviceCount; ++i) 
    {
        // Input buffer
        workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (uiHA - workOffset[i]);        

        d_A[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float) * uiWA, NULL,NULL);

        // Copy only assigned rows from host to device
        clEnqueueCopyBuffer(commandQueue[i], h_A, d_A[i], workOffset[i] * sizeof(float) * uiWA, 
                            0, workSize[i] * sizeof(float) * uiWA, 0, NULL, NULL);        
        
        // create OpenCL buffer on device that will be initiatlize from the host memory on first use
        // on device
        d_B[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                mem_size_B, h_B_data, NULL);

        // Output buffer
        d_C[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,  workSize[i] * uiWC * sizeof(float), NULL,NULL);
              
        // set the args values
        clSetKernelArg(multiplicationKernel[i], 0, sizeof(cl_mem), (void *) &d_C[i]);
        clSetKernelArg(multiplicationKernel[i], 1, sizeof(cl_mem), (void *) &d_A[i]);
        clSetKernelArg(multiplicationKernel[i], 2, sizeof(cl_mem), (void *) &d_B[i]);
        clSetKernelArg(multiplicationKernel[i], 3, sizeof(float) * BLOCK_SIZE *BLOCK_SIZE, 0 );
        clSetKernelArg(multiplicationKernel[i], 4, sizeof(float) * BLOCK_SIZE *BLOCK_SIZE, 0 );
        clSetKernelArg(multiplicationKernel[i], 5, sizeof(cl_int), (void *) &uiWA);
        clSetKernelArg(multiplicationKernel[i], 6, sizeof(cl_int), (void *) &uiWB);

        if(i+1 < ciDeviceCount)
            workOffset[i + 1] = workOffset[i] + workSize[i];
    }
    
    // Execute Multiplication on all GPUs in parallel
    size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
    size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE, uiWC), shrRoundUp(BLOCK_SIZE, workSize[0])};
    
    // Launch kernels on devices
#ifdef GPU_PROFILING
	
	int nIter = 30;

    for (int j = -1; j < nIter; j++) 
    {
        // Sync all queues to host and start timer first time through loop
        if(j == 0){
            for(unsigned int i = 0; i < ciDeviceCount; i++) 
            {
                clFinish(commandQueue[i]);
            }

            shrDeltaT(0);
        }
#endif
        for(unsigned int i = 0; i < ciDeviceCount; i++) 
        {
			// Multiplication - non-blocking execution:  launch and push to device(s)
			globalWorkSize[1] = shrRoundUp(BLOCK_SIZE, workSize[i]);
			clEnqueueNDRangeKernel(commandQueue[i], multiplicationKernel[i], 2, 0, globalWorkSize, localWorkSize,
				                   0, NULL, &GPUExecution[i]);
            clFlush(commandQueue[i]);
		}

#ifdef GPU_PROFILING
    }
#endif

    // sync all queues to host
	for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {
		clFinish(commandQueue[i]);
	}

#ifdef GPU_PROFILING

    // stop and log timer 
    double dSeconds = shrDeltaT(0)/(double)nIter;
    double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;
    double gflops = 1.0e-9 * dNumOps/dSeconds;
    shrLogEx(LOGBOTH | MASTER, 0, "oclMatrixMul, Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Workgroup = %u\n", 
            gflops, dSeconds, dNumOps, ciDeviceCount, localWorkSize[0] * localWorkSize[1]);

    // Print kernel timing per GPU
    shrLog("\n");
    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {    
        shrLog("  Kernel execution time on GPU %d \t: %.5f s\n", i, executionTime(GPUExecution[i]));
    }
    shrLog("\n");
#endif

    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {    
        // Non-blocking copy of result from device to host
        clEnqueueReadBuffer(commandQueue[i], d_C[i], CL_FALSE, 0, uiWC * sizeof(float) * workSize[i], 
                            h_C + workOffset[i] * uiWC, 0, NULL, &GPUDone[i]);
    }

	// CPU sync with GPU
    clWaitForEvents(ciDeviceCount, GPUDone);


    // Release mem and event objects    
    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {
        clReleaseMemObject(d_A[i]);
        clReleaseMemObject(d_C[i]);
        clReleaseMemObject(d_B[i]);
	    clReleaseEvent(GPUExecution[i]);
	    clReleaseEvent(GPUDone[i]);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    shrQAStart(argc, argv);
    // start logs 
    shrSetLogFileName ("oclSimpleMultiGPU.txt");
    shrLog("%s Starting, Array = %u float values...\n\n", argv[0], DATA_N); 

    // OpenCL
    cl_platform_id cpPlatform;
    cl_uint ciDeviceCount;
    cl_device_id* cdDevices;
    cl_context cxGPUContext;
    cl_device_id cdDevice;                          // GPU device
    int deviceNr[MAX_GPU_COUNT];
    cl_command_queue commandQueue[MAX_GPU_COUNT];
    cl_mem d_Data[MAX_GPU_COUNT];
    cl_mem d_Result[MAX_GPU_COUNT];
    cl_program cpProgram; 
    cl_kernel reduceKernel[MAX_GPU_COUNT];
    cl_event GPUDone[MAX_GPU_COUNT];
    cl_event GPUExecution[MAX_GPU_COUNT];
    size_t programLength;
    cl_int ciErrNum;			               
    char cDeviceName [256];
    cl_mem h_DataBuffer;

    // Vars for reduction results
    float h_SumGPU[MAX_GPU_COUNT * ACCUM_N];   
    float *h_Data;
    double sumGPU;
    double sumCPU, dRelError;

    // allocate and init host buffer with with some random generated input data
    h_Data = (float *)malloc(DATA_N * sizeof(float));
    shrFillArray(h_Data, DATA_N);

    // start timer & logs 
    shrLog("Setting up OpenCL on the Host...\n\n"); 
    shrDeltaT(1);

    // Annotate profiling state
    #ifdef GPU_PROFILING
        shrLog("OpenCL Profiling is enabled...\n\n"); 
    #endif

     //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clGetPlatformID...\n"); 

    //Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clGetDeviceIDs...\n"); 

    //Create the context
    cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clCreateContext...\n");

    // Set up command queue(s) for GPU's specified on the command line or all GPU's
    if(shrCheckCmdLineFlag(argc, (const char **)argv, "device"))
    {
        // User specified GPUs
        int ciMaxDeviceID = ciDeviceCount-1;

        ciDeviceCount = 0;
        char* deviceList;
        char* deviceStr;
        char* next_token;
        shrGetCmdLineArgumentstr(argc, (const char **)argv, "device", &deviceList);

        #ifdef WIN32
            deviceStr = strtok_s (deviceList," ,.-", &next_token);
        #else
            deviceStr = strtok (deviceList," ,.-");
        #endif   

        // Create command queues for all Requested GPU's
        while(deviceStr != NULL) 
        {
            // get & log device index # and name
            deviceNr[ciDeviceCount] = atoi(deviceStr);
            if( deviceNr[ciDeviceCount] > ciMaxDeviceID ) {
                shrLog(" Invalid user specified device ID: %d\n", deviceNr[ciDeviceCount]);
                return 1;
            }

            cdDevice = oclGetDev(cxGPUContext, deviceNr[ciDeviceCount]);
            ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog(" Device %i: %s\n\n", deviceNr[ciDeviceCount], cDeviceName);

            // create a command que
            commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog("clCreateCommandQueue\n"); 

            ++ciDeviceCount;

            #ifdef WIN32
                deviceStr = strtok_s (NULL," ,.-", &next_token);
            #else            
                deviceStr = strtok (NULL," ,.-");
            #endif
        }

        free(deviceList);
    } 
    else 
    {
        // Find out how many GPU's to compute on all available GPUs
        size_t nDeviceBytes;
        ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
        oclCheckError(ciErrNum, CL_SUCCESS);
        ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);

        for(unsigned int i = 0; i < ciDeviceCount; ++i ) 
        {
            // get & log device index # and name
            deviceNr[i] = i;
            cdDevice = oclGetDev(cxGPUContext, i);
            ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog(" Device %i: %s\n", i, cDeviceName);

            // create a command que
            commandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog("clCreateCommandQueue\n\n"); 
        }
    }

    // Load the OpenCL source code from the .cl file 
    const char* source_path = shrFindFilePath("simpleMultiGPU.cl", argv[0]);
    char *source = oclLoadProgSource(source_path, "", &programLength);
    oclCheckError(source != NULL, shrTRUE);
    shrLog("oclLoadProgSource\n"); 

    // Create the program for all GPUs in the context
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &programLength, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clCreateProgramWithSource\n"); 
    
    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSimpleMultiGPU.ptx");
        oclCheckError(ciErrNum, CL_SUCCESS); 
    }
    shrLog("clBuildProgram\n"); 

    // Create host buffer with page-locked memory
    h_DataBuffer = clCreateBuffer(cxGPUContext, CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR,
                                  DATA_N * sizeof(float), h_Data, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clCreateBuffer (Page-locked Host)\n\n"); 

    // Create buffers for each GPU, with data divided evenly among GPU's
    int sizePerGPU = DATA_N / ciDeviceCount;
    int workOffset[MAX_GPU_COUNT];
    int workSize[MAX_GPU_COUNT];
    workOffset[0] = 0;
    for(unsigned int i = 0; i < ciDeviceCount; ++i ) 
    {
        workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (DATA_N - workOffset[i]);        

        // Input buffer
        d_Data[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateBuffer (Input)\t\tDev %i\n", i); 

        // Copy data from host to device
        ciErrNum = clEnqueueCopyBuffer(commandQueue[i], h_DataBuffer, d_Data[i], workOffset[i] * sizeof(float), 
                                      0, workSize[i] * sizeof(float), 0, NULL, NULL);        
        shrLog("clEnqueueCopyBuffer (Input)\tDev %i\n", i);

        // Output buffer
        d_Result[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, ACCUM_N * sizeof(float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateBuffer (Output)\t\tDev %i\n", i);
        
        // Create kernel
        reduceKernel[i] = clCreateKernel(cpProgram, "reduce", &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateKernel\t\t\tDev %i\n", i); 
        
        // Set the args values and check for errors
        ciErrNum |= clSetKernelArg(reduceKernel[i], 0, sizeof(cl_mem), &d_Result[i]);
        ciErrNum |= clSetKernelArg(reduceKernel[i], 1, sizeof(cl_mem), &d_Data[i]);
        ciErrNum |= clSetKernelArg(reduceKernel[i], 2, sizeof(int), &workSize[i]);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clSetKernelArg\t\t\tDev %i\n\n", i);

        workOffset[i + 1] = workOffset[i] + workSize[i];
    }

    // Set # of work items in work group and total in 1 dimensional range
    size_t localWorkSize[] = {THREAD_N};        
    size_t globalWorkSize[] = {ACCUM_N};        

    // Start timer and launch reduction kernel on each GPU, with data split between them 
    shrLog("Launching Kernels on GPU(s)...\n\n");
    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {        
        ciErrNum = clEnqueueNDRangeKernel(commandQueue[i], reduceKernel[i], 1, 0, globalWorkSize, localWorkSize,
                                         0, NULL, &GPUExecution[i]);
        oclCheckError(ciErrNum, CL_SUCCESS);
    }
    
    // Copy result from device to host for each device
    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {
        ciErrNum = clEnqueueReadBuffer(commandQueue[i], d_Result[i], CL_FALSE, 0, ACCUM_N * sizeof(float), 
                            h_SumGPU + i *  ACCUM_N, 0, NULL, &GPUDone[i]);
        oclCheckError(ciErrNum, CL_SUCCESS);
    }

    // Synchronize with the GPUs and do accumulated error check
    clWaitForEvents(ciDeviceCount, GPUDone);
    shrLog("clWaitForEvents complete...\n\n"); 

    // Aggregate results for multiple GPU's and stop/log processing time
    sumGPU = 0;
    for(unsigned int i = 0; i < ciDeviceCount * ACCUM_N; i++)
    {
         sumGPU += h_SumGPU[i];
    }

    // Print Execution Times for each GPU
    #ifdef GPU_PROFILING
        shrLog("Profiling Information for GPU Processing:\n\n");
        for(unsigned int i = 0; i < ciDeviceCount; i++) 
        {
            cdDevice = oclGetDev(cxGPUContext, deviceNr[i]);
            clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL);
            shrLog("Device %i : %s\n", deviceNr[i], cDeviceName);
            shrLog("  Reduce Kernel     : %.5f s\n", executionTime(GPUExecution[i]));
            shrLog("  Copy Device->Host : %.5f s\n\n\n", executionTime(GPUDone[i]));
        }
    #endif

    // Run the computation on the Host CPU and log processing time 
    shrLog("Launching Host/CPU C++ Computation...\n\n");
    sumCPU = 0;
    for(unsigned int i = 0; i < DATA_N; i++)
    {
        sumCPU += h_Data[i];
    }

    // Check GPU result against CPU result 
    dRelError = 100.0 * fabs(sumCPU - sumGPU) / fabs(sumCPU);
    shrLog("Comparing against Host/C++ computation...\n"); 
    shrLog(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU);
    shrLog(" Relative Error (100.0 * Error / Golden) = %f \n\n", dRelError);

    // cleanup 
    free(source);
    free(h_Data);
    for(unsigned int i = 0; i < ciDeviceCount; ++i ) 
    {
        clReleaseKernel(reduceKernel[i]);
        clReleaseCommandQueue(commandQueue[i]);
    }
    clReleaseProgram(cpProgram);
    clReleaseContext(cxGPUContext);

    // finish
    shrQAFinishExit(argc, (const char **)argv, (dRelError < 1e-4) ? QA_PASSED : QA_FAILED);
  }
int main(int argc, char *argv[]){
    // check commandline parameters
    if (argc < 3) {
        fprintf(stderr, "Usage: %s [kernel] [length of vector] [dim]\n",
                argv[0]);
        exit(1);
    }
    
    cl_int errorCode;
    cl_device_type      deviceType = CL_DEVICE_TYPE_CPU;
    cl_device_id *      devices = NULL;
    cl_context          context = NULL;
    cl_command_queue    cmdQueue = NULL;
    cl_program          program = NULL;

    char *kernelfile = argv[1];
    int length = atoi(argv[2]);
    int dim = atoi(argv[3]);

    assert(initialization(
                deviceType,
                devices,
                &context,
                &cmdQueue,
                &program,
                kernelfile));

    float *X = (float*) malloc(sizeof(float)*length);
    float *Y = (float*) malloc(sizeof(float)*length);
    float *Z = (float*) malloc(sizeof(float)*length);

    for (int i = 0; i < length; i++) {
        X[i] = (float)i + 0.1;
        Y[i] = (float)i + 0.2;
        Z[i] = 0.0;
    } 

    cl_mem X_mem, Y_mem, Z_mem;
    ALLOCATE_GPU_READ(X_mem, X, sizeof(float)*length);
    ALLOCATE_GPU_READ(Y_mem, Y, sizeof(float)*length);
    ALLOCATE_GPU_READ_WRITE_INIT(Z_mem, Z, sizeof(float)*length); 
    
    size_t globalSize[1] = {length / dim};
    size_t localSize[1] = {1};

    float alpha = 0.2;
    cl_kernel kernel = clCreateKernel(program, "saxpy_naive", &errorCode); CHECKERROR;
    errorCode = clSetKernelArg(kernel, 0, sizeof(cl_mem), &X_mem); CHECKERROR;
    errorCode = clSetKernelArg(kernel, 1, sizeof(cl_mem), &Y_mem); CHECKERROR;
    errorCode = clSetKernelArg(kernel, 2, sizeof(cl_mem), &Z_mem); CHECKERROR;
    errorCode = clSetKernelArg(kernel, 3, sizeof(cl_float), &alpha); CHECKERROR;
    errorCode = clSetKernelArg(kernel, 4, sizeof(cl_int), &dim); CHECKERROR;

    errorCode = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalSize, localSize, 0, NULL, NULL); CHECKERROR;
    printf("Start to Run ...\n");
    cl_event runEvent;
    errorCode = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalSize, localSize, 0, NULL, &runEvent); CHECKERROR;
    errorCode = clFinish(cmdQueue);

    printf("Execution Time: %.2fns\n", executionTime(runEvent) / length * 1e9);

    printf("Start to Readback ...\n");
    errorCode = clEnqueueReadBuffer(cmdQueue, Z_mem, CL_TRUE, 0, sizeof(float)*length, Z, 0, NULL, NULL); CHECKERROR;
    
    printf("Checking Correctness ...\n");
    
    for (int i = 0; i < length; i++) {
        float res = X[i] * alpha + Y[i];
        float ans = Z[i];
        if (res - ans > 1E-4 || res - ans < -1E-4) {
            printf("%.10f %.10f %.10f\n", res, ans, res-ans);
            fprintf(stderr, "ERROR!");
            exit(1);
        }
    }   
    printf("OK\n");

    return 0;
}
void PhotometricStereo::execute() {

    /* measuring ps performance */
    long start = getMilliSecs();

    /* creating OpenCL buffers */
    size_t imgSize3 = sizeof(float) * (height*width*3);
    size_t gradSize = sizeof(float) * (height*width);
    size_t sSize = sizeof(float) * (lightSrcsInv.rows*lightSrcsInv.cols*lightSrcsInv.channels());

    cl::ImageFormat imgFormat = cl::ImageFormat(CL_INTENSITY, CL_UNORM_INT8);

    cl_img1 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img2 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img3 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img4 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img5 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img6 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img7 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_img8 = cl::Image2D(context, CL_MEM_READ_ONLY, imgFormat, width, height, 0, NULL, &error);
    cl_Sinv = cl::Buffer(context, CL_MEM_READ_ONLY, sSize, NULL, &error);
    cl_Pgrads = cl::Buffer(context, CL_MEM_WRITE_ONLY, gradSize, NULL, &error);
    cl_Qgrads = cl::Buffer(context, CL_MEM_WRITE_ONLY, gradSize, NULL, &error);
    cl_N = cl::Buffer(context, CL_MEM_WRITE_ONLY, imgSize3, NULL, &error);

    /* pushing data to CPU */
    cv::Mat Normals(height, width, CV_32FC3, cv::Scalar::all(0));
    cv::Mat Pgrads(height, width, CV_32F, cv::Scalar::all(0));
    cv::Mat Qgrads(height, width, CV_32F, cv::Scalar::all(0));

    cl::size_t<3> origin; origin[0] = 0; origin[1] = 0; origin[2] = 0;
    cl::size_t<3> region; region[0] = width; region[1] = height; region[2] = 1;

    mutex.lock();
    queue.enqueueWriteImage(cl_img1, CL_TRUE, origin, region, 0, 0, psImages.at(0).data);
    queue.enqueueWriteImage(cl_img2, CL_TRUE, origin, region, 0, 0, psImages.at(1).data);
    queue.enqueueWriteImage(cl_img3, CL_TRUE, origin, region, 0, 0, psImages.at(2).data);
    queue.enqueueWriteImage(cl_img4, CL_TRUE, origin, region, 0, 0, psImages.at(3).data);
    queue.enqueueWriteImage(cl_img5, CL_TRUE, origin, region, 0, 0, psImages.at(4).data);
    queue.enqueueWriteImage(cl_img6, CL_TRUE, origin, region, 0, 0, psImages.at(5).data);
    queue.enqueueWriteImage(cl_img7, CL_TRUE, origin, region, 0, 0, psImages.at(6).data);
    queue.enqueueWriteImage(cl_img8, CL_TRUE, origin, region, 0, 0, psImages.at(7).data);
    mutex.unlock();
    queue.enqueueWriteBuffer(cl_Sinv, CL_TRUE, 0, sSize, lightSrcsInv.data, NULL, &event);
    queue.enqueueWriteBuffer(cl_Pgrads, CL_TRUE, 0, gradSize, Pgrads.data, NULL, &event);
    queue.enqueueWriteBuffer(cl_Qgrads, CL_TRUE, 0, gradSize, Qgrads.data, NULL, &event);
    queue.enqueueWriteBuffer(cl_N, CL_TRUE, 0, imgSize3, Normals.data, NULL, &event);

    /* set kernel arguments */
    calcNormKernel.setArg(0, cl_img1); // 1-8 images
    calcNormKernel.setArg(1, cl_img2);
    calcNormKernel.setArg(2, cl_img3);
    calcNormKernel.setArg(3, cl_img4);
    calcNormKernel.setArg(4, cl_img5);
    calcNormKernel.setArg(5, cl_img6);
    calcNormKernel.setArg(6, cl_img7);
    calcNormKernel.setArg(7, cl_img8);
    calcNormKernel.setArg(8, width); // required for..
    calcNormKernel.setArg(9, height); // ..determining array dimensions
    calcNormKernel.setArg(10, cl_Sinv); // inverse of light matrix
    calcNormKernel.setArg(11, cl_Pgrads); // P gradients
    calcNormKernel.setArg(12, cl_Qgrads); // Q gradients
    calcNormKernel.setArg(13, cl_N); // normals for each point
    calcNormKernel.setArg(14, maxpq); // max depth gradients as in [Wei2001]
    calcNormKernel.setArg(15, minIntensity); // exaggerate slope as in [Malzbender2006]

    /* wait for command queue to finish before continuing */
    queue.finish();

    /* executing kernel */
    queue.enqueueNDRangeKernel(calcNormKernel, cl::NullRange, cl::NDRange(height, width), cl::NullRange, NULL, &event);
    queue.finish();

    /* reading back from CPU device */
    queue.enqueueReadBuffer(cl_Pgrads, CL_TRUE, 0, gradSize, Pgrads.data);
    queue.enqueueReadBuffer(cl_Qgrads, CL_TRUE, 0, gradSize, Qgrads.data);
    queue.enqueueReadBuffer(cl_N, CL_TRUE, 0, sizeof(float) * (height*width*3), Normals.data);

    /* integrate and get heights globally */
    cv::Mat Zcoords = getGlobalHeights(Pgrads, Qgrads);
    
    /* pushing normals to CPU again */
    cl_N = cl::Buffer(context, CL_MEM_READ_WRITE, imgSize3, NULL, &error);
    queue.enqueueWriteBuffer(cl_N, CL_TRUE, 0, imgSize3, Normals.data);
    
    /*  unsharp masking as in [Malzbender2006] */
    updateNormKernel.setArg(0, cl_N);
    updateNormKernel.setArg(1, width);
    updateNormKernel.setArg(2, height);
    updateNormKernel.setArg(3, cl_Pgrads);
    updateNormKernel.setArg(4, cl_Qgrads);
    updateNormKernel.setArg(5, unsharpScaleFactor);
    
    /* executing kernel updating normals */
    queue.enqueueNDRangeKernel(updateNormKernel, cl::NullRange, cl::NDRange(height, width), cl::NullRange, NULL, &event);
    queue.finish();
    
    /* reading back from CPU device */
    queue.enqueueReadBuffer(cl_Pgrads, CL_TRUE, 0, gradSize, Pgrads.data);
    queue.enqueueReadBuffer(cl_Qgrads, CL_TRUE, 0, gradSize, Qgrads.data);
    queue.enqueueReadBuffer(cl_N, CL_TRUE, 0, imgSize3, Normals.data);
    
    /* integrate updated gradients second time */
    Zcoords = getGlobalHeights(Pgrads, Qgrads);

    /* store 3d data and normals tensor-like */
    std::vector<cv::Mat> matVec;
    matVec.push_back(XCoords);
    matVec.push_back(YCoords);
    matVec.push_back(Zcoords);
    matVec.push_back(Normals);

    emit executionTime("Elapsed time: " + QString::number(getMilliSecs() - start) + " ms.");
    emit modelFinished(matVec);
}