int main(int argc, char *argv[])
{
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue command_queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem buffer;
    cl_int error;
    cl_event event;
    cl_ulong startTime, endTime;
    size_t globalSize[1], localSize[1], warpSize;
    FILE* fptr;
    unsigned long long start, end;

    void* hostData = NULL;

    /* Parse options */
    CommandParser(argc, argv);
    HostDataCreation(hostData);

    GetPlatformAndDevice(platform, device);
    fptr = fopen(g_opencl_ctrl.powerFile, "a");

    /* Create context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
    CHECK_CL_ERROR(error);

    /* Create command queue */
#ifdef USE_CL_2_0_API
    {
        cl_queue_properties property[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
        command_queue = clCreateCommandQueueWithProperties(context, device, property, &error);
    }
#else
    {
        command_queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &error);
    }
#endif
    CHECK_CL_ERROR(error);

    /* Create program */
    CreateAndBuildProgram(program, context, device, strdup(g_opencl_ctrl.fileName));

    /* Create kernels */
    kernel = clCreateKernel(program, g_opencl_ctrl.kernelName, &error);
    CHECK_CL_ERROR(error);

    error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &warpSize, NULL);
    CHECK_CL_ERROR(error);
    fprintf(stderr, "Preferred work group size: %lu\n", warpSize);

#if 0
    fprintf(stderr, "\nData before process:\n");
    switch (g_opencl_ctrl.dataType)
    {
        case TYPE_INT:
            {
                int *intptr = (int *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%d ", intptr[i]);
                fprintf(stderr, "\n");
            }
            break;
        case TYPE_FLOAT:
            {
                float *fltptr = (float *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%f ", fltptr[i]);
                fprintf(stderr, "\n");
            }
            break;
       case TYPE_DOUBLE:
            {
                double *dblptr = (double *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%lf ", dblptr[i]);
                fprintf(stderr, "\n");
            }
            break;
    }
#endif

    /* Create buffers */
    buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, g_opencl_ctrl.dataByte, hostData, &error);
    CHECK_CL_ERROR(error);

    /* Execute kernels */
    error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel, 1, sizeof(long), &g_opencl_ctrl.iteration);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel, 2, sizeof(int), &g_opencl_ctrl.interval);
    CHECK_CL_ERROR(error);

    start = PrintTimingInfo(fptr);

    globalSize[0] = g_opencl_ctrl.global_size;
    localSize[0] = g_opencl_ctrl.local_size;
    error = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalSize, localSize, 0, NULL, &event);
    CHECK_CL_ERROR(error);
    error = clFinish(command_queue);
    CHECK_CL_ERROR(error);

    end = PrintTimingInfo(fptr);
    fclose(fptr);

    error = clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, g_opencl_ctrl.dataByte, hostData, 0, NULL, NULL);
    CHECK_CL_ERROR(error);

#if 0
    fprintf(stderr, "\nData after process:\n");
    switch (g_opencl_ctrl.dataType)
    {
        case TYPE_INT:
            {
                int *intptr = (int *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%d ", intptr[i]);
                fprintf(stderr, "\n");
            }
            break;
        case TYPE_FLOAT:
            {
                float *fltptr = (float *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%f ", fltptr[i]);
                fprintf(stderr, "\n");
            }
            break;
       case TYPE_DOUBLE:
            {
                double *dblptr = (double *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%lf ", dblptr[i]);
                fprintf(stderr, "\n");
            }
            break;
    }
#endif

    /* Event profiling */
    error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL);
    CHECK_CL_ERROR(error);
    error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(endTime), &endTime, NULL);
    CHECK_CL_ERROR(error);
    fprintf(stderr, "\n['%s' execution time] %llu ns\n", g_opencl_ctrl.kernelName, (end - start) * 1000);
    fprintf(stdout, "%llu\n", (end - start) * 1000);

    /* Read the output */

    /* Release object */
    clReleaseKernel(kernel);
    clReleaseMemObject(buffer);
    clReleaseEvent(event);
    clReleaseProgram(program);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
    free(hostData);

    return 0;
}
Beispiel #2
0
/*
 * main execution routine
 * Basically it consists of three parts:
 *   - generating the inputs
 *   - running OpenCL kernel
 *   - reading results of processing
 */
int _tmain(int argc, TCHAR* argv[])
{
    cl_int err;
    ocl_args_d_t ocl;
    cl_device_type deviceType = CL_DEVICE_TYPE_GPU;

    LARGE_INTEGER perfFrequency;
    LARGE_INTEGER performanceCountNDRangeStart;
    LARGE_INTEGER performanceCountNDRangeStop;

    cl_uint arrayWidth = 1024;
    cl_uint arrayHeight = 1024;

    //initialize Open CL objects (context, queue, etc.)
    if (CL_SUCCESS != SetupOpenCL(&ocl, deviceType))
    {
        return -1;
    }

    // allocate working buffers. 
    // the buffer should be aligned with 4K page and size should fit 64-byte cached line
    cl_uint optimizedSize = ((sizeof(cl_int) * arrayWidth * arrayHeight - 1) / 64 + 1) * 64;
    cl_int* inputA = (cl_int*)_aligned_malloc(optimizedSize, 4096);
    cl_int* inputB = (cl_int*)_aligned_malloc(optimizedSize, 4096);
    cl_int* outputC = (cl_int*)_aligned_malloc(optimizedSize, 4096);
    if (NULL == inputA || NULL == inputB || NULL == outputC)
    {
        LogError("Error: _aligned_malloc failed to allocate buffers.\n");
        return -1;
    }

    //random input
    generateInput(inputA, arrayWidth, arrayHeight);
    generateInput(inputB, arrayWidth, arrayHeight);

    // Create OpenCL buffers from host memory
    // These buffers will be used later by the OpenCL kernel
    if (CL_SUCCESS != CreateBufferArguments(&ocl, inputA, inputB, outputC, arrayWidth, arrayHeight))
    {
        return -1;
    }

    // Create and build the OpenCL program
    if (CL_SUCCESS != CreateAndBuildProgram(&ocl))
    {
        return -1;
    }

    // Program consists of kernels.
    // Each kernel can be called (enqueued) from the host part of OpenCL application.
    // To call the kernel, you need to create it from existing program.
    ocl.kernel = clCreateKernel(ocl.program, "Add", &err);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clCreateKernel returned %s\n", TranslateOpenCLError(err));
        return -1;
    }

    // Passing arguments into OpenCL kernel.
    if (CL_SUCCESS != SetKernelArguments(&ocl))
    {
        return -1;
    }

    // Regularly you wish to use OpenCL in your application to achieve greater performance results
    // that are hard to achieve in other ways.
    // To understand those performance benefits you may want to measure time your application spent in OpenCL kernel execution.
    // The recommended way to obtain this time is to measure interval between two moments:
    //   - just before clEnqueueNDRangeKernel is called, and
    //   - just after clFinish is called
    // clFinish is necessary to measure entire time spending in the kernel, measuring just clEnqueueNDRangeKernel is not enough,
    // because this call doesn't guarantees that kernel is finished.
    // clEnqueueNDRangeKernel is just enqueue new command in OpenCL command queue and doesn't wait until it ends.
    // clFinish waits until all commands in command queue are finished, that suits your need to measure time.
    bool queueProfilingEnable = true;
    if (queueProfilingEnable)
        QueryPerformanceCounter(&performanceCountNDRangeStart);
    // Execute (enqueue) the kernel
    if (CL_SUCCESS != ExecuteAddKernel(&ocl, arrayWidth, arrayHeight))
    {
        return -1;
    }
    if (queueProfilingEnable)
        QueryPerformanceCounter(&performanceCountNDRangeStop);

    // The last part of this function: getting processed results back.
    // use map-unmap sequence to update original memory area with output buffer.
    ReadAndVerify(&ocl, arrayWidth, arrayHeight, inputA, inputB);

    // retrieve performance counter frequency
    if (queueProfilingEnable)
    {
        QueryPerformanceFrequency(&perfFrequency);
        LogInfo("NDRange performance counter time %f ms.\n",
            1000.0f*(float)(performanceCountNDRangeStop.QuadPart - performanceCountNDRangeStart.QuadPart) / (float)perfFrequency.QuadPart);
    }

    _aligned_free(inputA);
    _aligned_free(inputB);
    _aligned_free(outputC);

#if defined(_DEBUG)
    getchar();
#endif

    return 0;
}
/*
 * Host driver that runs two OpenCL kernels, "Generate" then "Access", over the
 * same 2-D buffer and optionally reports the wall-clock time of the second.
 *
 * Steps, in order:
 *   - parse the command line, open the power/timing log, create the host data
 *   - create context, command queue, program and both kernels
 *   - run "Generate", wait for it, then run "Access" and wait for it
 *   - read the buffer back, release everything and print the timing
 *
 * Exits with status 1 if the log file cannot be opened; returns 0 otherwise.
 * OpenCL failures are handled by CHECK_CL_ERROR (defined elsewhere).
 */
int main(int argc, char *argv[])
{
    FILE* g_fptr;
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue command_queue;
    cl_program program;
    cl_kernel kernel1, kernel2;
    cl_mem inputBufferA;
    cl_int error;
    size_t globalSize[2], localSize[2];

    struct timeval startTime, endTime;

    void* inputMatrixA = NULL;
    /* Parse options */
    CommandParser(argc, argv);

    g_fptr = fopen(g_opencl_ctrl.powerFile, "a");
    if (!g_fptr)
        exit(1);

    /* NOTE(review): inputMatrixA is used as a valid allocation below, so this
     * call presumably takes it by reference and allocates it -- confirm. */
    HostDataCreation(inputMatrixA);

    GetPlatformAndDevice(platform, device);

    /* Create context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
    CHECK_CL_ERROR(error);

    /* Create command queue */
    command_queue = clCreateCommandQueue(context, device, 0, &error);
    CHECK_CL_ERROR(error);

    /* Create program */
    CreateAndBuildProgram(program, context, device, strdup(CL_FILE_NAME));

    /* Create kernels */
    kernel1 = clCreateKernel(program, "Generate", &error);
    CHECK_CL_ERROR(error);
    kernel2 = clCreateKernel(program, "Access", &error);
    CHECK_CL_ERROR(error);

    /* Create buffers: the device buffer starts as a copy of the host matrix */
    inputBufferA = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, g_opencl_ctrl.inputByteA, inputMatrixA, &error);
    CHECK_CL_ERROR(error);

    /* Execute kernels */
    /* "Generate" arguments: buffer, width, height */
    error = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &inputBufferA);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel1, 1, sizeof(int), &g_opencl_ctrl.dataSizeW);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel1, 2, sizeof(int), &g_opencl_ctrl.dataSizeH);
    CHECK_CL_ERROR(error);

    /* "Access" arguments: buffer, width, iteration count */
    error = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &inputBufferA);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel2, 1, sizeof(int), &g_opencl_ctrl.dataSizeW);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel2, 2, sizeof(int), &g_opencl_ctrl.iteration);
    CHECK_CL_ERROR(error);


    globalSize[0] = g_opencl_ctrl.dataSizeW;
    globalSize[1] = g_opencl_ctrl.dataSizeH;
    localSize[0] = g_opencl_ctrl.local_size1;
    localSize[1] = g_opencl_ctrl.local_size2;

    /* The sizes are size_t values, so they must be printed with %zu */
    fprintf(stderr, "global size: %zu %zu\n", globalSize[0], globalSize[1]);
    fprintf(stderr, "local size: %zu %zu\n", localSize[0], localSize[1]);

    /* First kernel: not timed, only waited for */
    error = clEnqueueNDRangeKernel(command_queue, kernel1, 2, NULL, globalSize, localSize, 0, NULL, NULL);
    CHECK_CL_ERROR(error);
    error = clFinish(command_queue);
    CHECK_CL_ERROR(error);

    PrintTimingInfo(g_fptr);

    if (g_opencl_ctrl.timing)
        gettimeofday(&startTime, NULL);

    /* Second kernel: the one whose wall-clock time is reported */
    error = clEnqueueNDRangeKernel(command_queue, kernel2, 2, NULL, globalSize, localSize, 0, NULL, NULL);
    CHECK_CL_ERROR(error);
    error = clFinish(command_queue);
    CHECK_CL_ERROR(error);
    PrintTimingInfo(g_fptr);

    if (g_opencl_ctrl.timing)
        gettimeofday(&endTime, NULL);

    fclose(g_fptr);

    /* Read the output */
    error = clEnqueueReadBuffer(command_queue, inputBufferA, CL_TRUE, 0, g_opencl_ctrl.inputByteA, inputMatrixA, 0, NULL, NULL);
    CHECK_CL_ERROR(error);

    /* Release object */
    clReleaseKernel(kernel1);
    clReleaseKernel(kernel2);
    clReleaseMemObject(inputBufferA);
    clReleaseProgram(program);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
    free(inputMatrixA);

    if (g_opencl_ctrl.timing)
    {
        /* Convert both timestamps to microseconds. The seconds field is widened
         * before the multiplication so the product cannot overflow where
         * time_t is only 32 bits wide. */
        unsigned long long start, end;
        start = (unsigned long long)startTime.tv_sec * 1000000ULL + (unsigned long long)startTime.tv_usec;
        end = (unsigned long long)endTime.tv_sec * 1000000ULL + (unsigned long long)endTime.tv_usec;

        /* stderr gets milliseconds, stdout gets nanoseconds */
        fprintf(stderr, "Kernel execution time: %llu ms\n", (end - start) / 1000);
        fprintf(stdout, "%llu\n", (end - start) * 1000);
    }

    fprintf(stderr, "DONE.\n");

    return 0;
}
Beispiel #4
0
/*
 * Host driver for the "ray_cal" OpenCL kernel.
 *
 * Initializes OpenCL through the project helpers (context, devices, program,
 * buffers, kernel arguments), then starts two threads running RenderDisplay,
 * each with its own PASSING_OCL descriptor, and waits for both. The time spent
 * in initialization, in rendering and in total is printed to stdout.
 *
 * Returns 0 on success and 1 when an initialization step or thread creation
 * fails, so that callers can tell a failed run from a successful one.
 */
int main(int argc, char *argv[])
{
    struct timeval begin, end;
    gettimeofday(&begin, NULL);
    printf("OpenCL Initialization\n");

    cl_int err = CL_SUCCESS;

    struct timeval begin_init, end_init;
    gettimeofday(&begin_init, NULL);


    if(!createContext())
    {
        printf("Error: createContext\n");
        return 1;
    }

    if(!getDeviceIDs())
    {
        printf("Error: getDeviceIDs\n");
        return 1;
    }

    generateArgument();

    if(CreateAndBuildProgram() != CL_SUCCESS)
    {
        printf("Error: CreateAndBuildProgram\n");
        return 1;
    }

    if(CreateBufferArguments() != CL_SUCCESS)
    {
        printf("Error: CreateBufferArguments\n");
        return 1;
    }

    ocl.kernel = clCreateKernel(ocl.program, "ray_cal", &err);
    if(err != CL_SUCCESS)
    {
        printf("Error: clCreateKernel\n");
        return 1;
    }

    if(SetKernelArguments() != CL_SUCCESS)
    {
        printf("Error: SetKernelArguments\n");
        return 1;
    }

    gettimeofday(&end_init, NULL);
    /* NOTE(review): dividing by 1000000 and printing seconds implies that
     * timeval_diff() returns microseconds -- confirm against its definition. */
    printf("init elapsed time : %lfs\n", (double)timeval_diff(&end_init, &begin_init)/1000000);

    struct timeval begin_kernel, end_kernel;
    gettimeofday(&begin_kernel, NULL);

    srand((unsigned int)time(NULL));
    pthread_t t1, t2;

    int tmpvalue = WIDTH*HEIGHT/WorkAmount;

    printf("total recursive %d\n", tmpvalue);

    //PASSING_OCL ocl_info[2] = {{ocl, 0, 0, tmpvalue*5/8, 256}, {ocl, 1, tmpvalue*5/8, tmpvalue, 256}};

    /* One descriptor per worker thread. The array lives on this stack frame,
     * so both threads must be joined before main() returns. */
    PASSING_OCL ocl_info[2] = {{ocl, 0, 0, 8, 256}, {ocl, 1, 8, 16, 256}};

    /* pthread_create() returns non-zero on failure and then leaves the thread
     * id unset; joining such an id would be undefined behaviour. */
    if(pthread_create(&t1, NULL, RenderDisplay, (void *)&ocl_info[0]) != 0)
    {
        printf("Error: pthread_create\n");
        return 1;
    }
    if(pthread_create(&t2, NULL, RenderDisplay, (void *)&ocl_info[1]) != 0)
    {
        printf("Error: pthread_create\n");
        /* The first worker is already running and still uses ocl_info[0]. */
        pthread_join(t1, NULL);
        return 1;
    }
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);

    gettimeofday(&end_kernel, NULL);
    printf("render elapsed time : %lfs\n", (double)timeval_diff(&end_kernel, &begin_kernel)/1000000);

    /*
    std::ostringstream headerStream;
    headerStream << "P6\n";
    headerStream << Width << ' ' << Height << '\n';
    headerStream << "255\n";
    std::ofstream fileStream("out.ppm", std::ios::out | std::ios::binary);

    fileStream << headerStream.str();

    for (unsigned int j = 0; j < Width*Height; j++)
    {
        unsigned char r, g, b;
        unsigned int tmp = ImagePixel[j];
        r = (unsigned char)((tmp >> 16) & 0xFF);
        g = (unsigned char)((tmp >> 8) & 0xFF);
        b = (unsigned char)((tmp)& 0xFF);
        fileStream << r << g << b;
    }

    fileStream.flush();
    fileStream.close();
    */
    //gettimeofday(&end_kernel, NULL);
    //printf("render elapsed time : %lfs\n", (double)timeval_diff(&end_kernel, &begin_kernel)/1000000);

    gettimeofday(&end, NULL);
    printf("elapsed time : %lfs\n", (double)timeval_diff(&end, &begin)/1000000);

    printf("End Success\n");

    return 0;
}