Example #1
0
CLWProgram CLWProgram::CreateFromSource(char const* sourcecode,
                                        size_t sourcesize,
                                        char const** headers,
                                        char const** headernames,
                                        size_t* headersizes,
                                        int numheaders,
                                        CLWContext context)
{
    cl_int status = CL_SUCCESS;
    
    std::vector<cl_device_id> deviceIds(context.GetDeviceCount());
    for(unsigned int i = 0; i < context.GetDeviceCount(); ++i)
    {
        deviceIds[i] = context.GetDevice(i);
    }
    
    char const* buildopts =
#if defined(__APPLE__)
    "-D APPLE -cl-mad-enable -cl-fast-relaxed-math -cl-std=CL1.2 -I ."
#elif defined(_WIN32) || defined (WIN32)
    "-D WIN32 -cl-mad-enable -cl-fast-relaxed-math -cl-std=CL1.2 -I."
#elif defined(__linux__)
    "-D __linux__ -I."
#else
    nullptr
#endif
    ;
    
    std::vector<cl_program> headerPrograms(numheaders);
    for (int i=0; i<numheaders; ++i)
    {
        size_t sourceSize = headersizes[i];
        char const* tempPtr = headers[i];
        headerPrograms[i] = clCreateProgramWithSource(context, 1, (const char**)&tempPtr, &sourceSize, &status);
        ThrowIf(status != CL_SUCCESS, status, "clCreateProgramWithSource failed");
    }
    
    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&sourcecode, &sourcesize, &status);
    
    ThrowIf(status != CL_SUCCESS, status, "clCreateProgramWithSource failed");
    
    status = clCompileProgram(program, context.GetDeviceCount(), &deviceIds[0], buildopts, numheaders, &headerPrograms[0], headernames, nullptr, nullptr);
    
    if(status != CL_SUCCESS)
    {
        std::vector<char> buildLog;
        size_t logSize;
        clGetProgramBuildInfo(program, deviceIds[0], CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize);
        
        buildLog.resize(logSize);
        clGetProgramBuildInfo(program, deviceIds[0], CL_PROGRAM_BUILD_LOG, logSize, &buildLog[0], nullptr);
        
#ifdef _DEBUG
        std::cout << &buildLog[0] << "\n";
#endif
        
        throw CLWException(status, std::string(&buildLog[0]));
    }
    
    status = clBuildProgram(program, context.GetDeviceCount(), &deviceIds[0], buildopts, nullptr, nullptr);
    
    if(status != CL_SUCCESS)
    {
        std::vector<char> buildLog;
        size_t logSize;
        clGetProgramBuildInfo(program, deviceIds[0], CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize);
        
        buildLog.resize(logSize);
        clGetProgramBuildInfo(program, deviceIds[0], CL_PROGRAM_BUILD_LOG, logSize, &buildLog[0], nullptr);
        
#ifdef _DEBUG
        std::cout << &buildLog[0] << "\n";
#endif
        
        throw CLWException(status, std::string(&buildLog[0]));
    }
    
    CLWProgram prg(program);
    
    clReleaseProgram(program);
    
    return prg;
}
void buildOpenCLKernels_calc_dt_kernel_print(int xdim0, int xdim1, int xdim2, int xdim3, int xdim4, int xdim5) {

  //int ocl_fma = OCL_FMA;
  if(!isbuilt_calc_dt_kernel_print) {
    buildOpenCLKernels();
    //clSafeCall( clUnloadCompiler() );
    cl_int ret;
    char* source_filename[1] = {"./OpenCL/calc_dt_kernel_print.cl"};

    // Load the kernel source code into the array source_str
    FILE *fid;
    char *source_str[1];
    size_t source_size[1];

    for(int i=0; i<1; i++) {
      fid = fopen(source_filename[i], "r");
      if (!fid) {
        fprintf(stderr, "Can't open the kernel source file!\n");
        exit(1);
      }

      source_str[i] = (char*)malloc(4*0x1000000);
      source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid);
      if(source_size[i] != 4*0x1000000) {
        if (ferror(fid)) {
          printf ("Error while reading kernel source file %s\n", source_filename[i]);
          exit(-1);
        }
        if (feof(fid))
          printf ("Kernel source file %s succesfuly read.\n", source_filename[i]);
          //printf("%s\n",source_str[i]);
      }
      fclose(fid);
    }

    printf("Compiling calc_dt_kernel_print %d source -- start \n",OCL_FMA);

      // Create a program from the source
      OPS_opencl_core.program = clCreateProgramWithSource(OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret);
      clSafeCall( ret );

      // Build the program
      char buildOpts[255*7];
      char* pPath = NULL;
      pPath = getenv ("OPS_INSTALL_PATH");
      if (pPath!=NULL)
        if(OCL_FMA)
          sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d  -Dxdim0_calc_dt_kernel_print=%d  -Dxdim1_calc_dt_kernel_print=%d  -Dxdim2_calc_dt_kernel_print=%d  -Dxdim3_calc_dt_kernel_print=%d  -Dxdim4_calc_dt_kernel_print=%d  -Dxdim5_calc_dt_kernel_print=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5);
        else
          sprintf(buildOpts,"-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d  -Dxdim0_calc_dt_kernel_print=%d  -Dxdim1_calc_dt_kernel_print=%d  -Dxdim2_calc_dt_kernel_print=%d  -Dxdim3_calc_dt_kernel_print=%d  -Dxdim4_calc_dt_kernel_print=%d  -Dxdim5_calc_dt_kernel_print=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5);
      else {
        sprintf("Incorrect OPS_INSTALL_PATH %s\n",pPath);
        exit(EXIT_FAILURE);
      }

      ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id, buildOpts, NULL, NULL);

      if(ret != CL_SUCCESS) {
        char* build_log;
        size_t log_size;
        clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) );
        build_log = (char*) malloc(log_size+1);
        clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) );
        build_log[log_size] = '\0';
        fprintf(stderr, "=============== OpenCL Program Build Info ================\n\n%s", build_log);
        fprintf(stderr, "\n========================================================= \n");
        free(build_log);
        exit(EXIT_FAILURE);
      }
      printf("compiling calc_dt_kernel_print -- done\n");

    // Create the OpenCL kernel
    OPS_opencl_core.kernel[30] = clCreateKernel(OPS_opencl_core.program, "ops_calc_dt_kernel_print", &ret);
    clSafeCall( ret );

    isbuilt_calc_dt_kernel_print = true;
  }

}
/**
 * The implementation of the particle filter using OpenMP for many frames
 * @see http://openmp.org/wp/
 * @note This function is designed to work with a video of several frames. In addition, it references a provided MATLAB function which takes the video, the objxy matrix and the x and y arrays as arguments and returns the likelihoods
 * @param I The video to be run
 * @param IszX The x dimension of the video
 * @param IszY The y dimension of the video
 * @param Nfr The number of frames
 * @param seed The seed array used for random number generation
 * @param Nparticles The number of particles to be used
 */
int particleFilter(unsigned char * I, int IszX, int IszY, int Nfr, int * seed, int Nparticles) {
    int max_size = IszX * IszY*Nfr;
    //original particle centroid
    double xe = roundDouble(IszY / 2.0);
    double ye = roundDouble(IszX / 2.0);

    //expected object locations, compared to center
    int radius = 5;
    int diameter = radius * 2 - 1;
    int * disk = (int*) calloc(diameter * diameter, sizeof (int));
    strelDisk(disk, radius);
    int countOnes = 0;
    int x, y;
    for (x = 0; x < diameter; x++) {
        for (y = 0; y < diameter; y++) {
            if (disk[x * diameter + y] == 1)
                countOnes++;
        }
    }
    int * objxy = (int *) calloc(countOnes * 2, sizeof(int));
    getneighbors(disk, countOnes, objxy, radius);
    //initial weights are all equal (1/Nparticles)
    double * weights = (double *) calloc(Nparticles, sizeof(double));
    for (x = 0; x < Nparticles; x++) {
        weights[x] = 1 / ((double) (Nparticles));
    }
    /****************************************************************
     **************   B E G I N   A L L O C A T E *******************
     ****************************************************************/

    /***** kernel variables ******/
    cl_kernel kernel_likelihood;
    cl_kernel kernel_sum;
    cl_kernel kernel_normalize_weights;
    cl_kernel kernel_find_index;

    int sourcesize = 2048 * 2048;
    char * source = (char *) calloc(sourcesize, sizeof (char));
    if (!source) {
        printf("ERROR: calloc(%d) failed\n", sourcesize);
        return -1;
    }

    // read the kernel core source
    char * tempchar = "./particle_double.cl";
    FILE * fp = fopen(tempchar, "rb");
    if (!fp) {
        printf("ERROR: unable to open '%s'\n", tempchar);
        return -1;
    }
    fread(source + strlen(source), sourcesize, 1, fp);
    fclose(fp);

    // OpenCL initialization
    int use_gpu = 1;
    if (initialize(use_gpu)) return -1;

    // compile kernel
    cl_int err = 0;
    const char * slist[2] = {source, 0};
    cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateProgramWithSource() => %d\n", err);
        return -1;
    }

    err = clBuildProgram(prog, 1, device_list, "-cl-fast-relaxed-math", NULL, NULL);

    if (err != CL_SUCCESS) {
        if (err == CL_INVALID_PROGRAM)
            printf("CL_INVALID_PROGRAM\n");
        else if (err == CL_INVALID_VALUE)
            printf("CL_INVALID_VALUE\n");
        else if (err == CL_INVALID_DEVICE)
            printf("CL_INVALID_DEVICE\n");
        else if (err == CL_INVALID_BINARY)
            printf("CL_INVALID_BINARY\n");
        else if (err == CL_INVALID_BUILD_OPTIONS)
            printf("CL_INVALID_BUILD_OPTIONS\n");
        else if (err == CL_INVALID_OPERATION)
            printf("CL_INVALID_OPERATION\n");
        else if (err == CL_COMPILER_NOT_AVAILABLE)
            printf("CL_COMPILER_NOT_AVAILABLE\n");
        else if (err == CL_BUILD_PROGRAM_FAILURE)
            printf("CL_BUILD_PROGRAM_FAILURE\n");
        else if (err == CL_INVALID_OPERATION)
            printf("CL_INVALID_OPERATION\n");
        else if (err == CL_OUT_OF_RESOURCES)
            printf("CL_OUT_OF_RESOURCES\n");
        else if (err == CL_OUT_OF_HOST_MEMORY)
            printf("CL_OUT_OF_HOST_MEMORY\n");

        printf("ERROR: clBuildProgram() => %d\n", err);

        static char log[65536];
        memset(log, 0, sizeof (log));

        err = clGetProgramBuildInfo(prog, device_list[0], CL_PROGRAM_BUILD_LOG, sizeof (log) - 1, log, NULL);
        if (err != CL_SUCCESS) {
            printf("ERROR: clGetProgramBuildInfo() => %d\n", err);
        }
        if (strstr(log, "warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);


    }
    // { // show warnings/errors
    //     static char log[65536];
    //     memset(log, 0, sizeof (log));
    //     cl_device_id device_id[2] = {0};
    //     err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof (device_id), device_id, NULL);
    //     if (err != CL_SUCCESS) {
    //         if (err == CL_INVALID_CONTEXT)
    //             printf("ERROR: clGetContextInfo() => CL_INVALID_CONTEXT\n");
    //         if (err == CL_INVALID_VALUE)
    //             printf("ERROR: clGetContextInfo() => CL_INVALID_VALUE\n");
    //     }
    // }//*/

    char * s_likelihood_kernel = "likelihood_kernel";
    char * s_sum_kernel = "sum_kernel";
    char * s_normalize_weights_kernel = "normalize_weights_kernel";
    char * s_find_index_kernel = "find_index_kernel";

    kernel_likelihood = clCreateKernel(prog, s_likelihood_kernel, &err);
    if (err != CL_SUCCESS) {
        if (err == CL_INVALID_PROGRAM)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID PROGRAM %d\n", err);
        if (err == CL_INVALID_PROGRAM_EXECUTABLE)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID PROGRAM EXECUTABLE %d\n", err);
        if (err == CL_INVALID_KERNEL_NAME)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID KERNEL NAME %d\n", err);
        if (err == CL_INVALID_KERNEL_DEFINITION)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID KERNEL DEFINITION %d\n", err);
        if (err == CL_INVALID_VALUE)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID CL_INVALID_VALUE %d\n", err);
        printf("ERROR: clCreateKernel(likelihood_kernel) failed.\n");
        return -1;
    }
    kernel_sum = clCreateKernel(prog, s_sum_kernel, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateKernel(sum_kernel) 0 => %d\n", err);
        return -1;
    }
    kernel_normalize_weights = clCreateKernel(prog, s_normalize_weights_kernel, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateKernel(normalize_weights_kernel) 0 => %d\n", err);
        return -1;
    }
    kernel_find_index = clCreateKernel(prog, s_find_index_kernel, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateKernel(find_index_kernel) 0 => %d\n", err);
        return -1;
    }


    //initial likelihood to 0.0
    double * likelihood = (double *) calloc(Nparticles + 1, sizeof (double));
    double * arrayX = (double *) calloc(Nparticles, sizeof (double));
    double * arrayY = (double *) calloc(Nparticles, sizeof (double));
    double * xj = (double *) calloc(Nparticles, sizeof (double));
    double * yj = (double *) calloc(Nparticles, sizeof (double));
    double * CDF = (double *) calloc(Nparticles, sizeof(double));

    //GPU copies of arrays
    cl_mem arrayX_GPU;
    cl_mem arrayY_GPU;
    cl_mem xj_GPU;
    cl_mem yj_GPU;
    cl_mem CDF_GPU;
    cl_mem likelihood_GPU;
    cl_mem I_GPU;
    cl_mem weights_GPU;
    cl_mem objxy_GPU;

    int * ind = (int*) calloc(countOnes, sizeof(int));
    cl_mem ind_GPU;
    double * u = (double *) calloc(Nparticles, sizeof(double));
    cl_mem u_GPU;
    cl_mem seed_GPU;
    cl_mem partial_sums;


    //OpenCL memory allocation

    arrayX_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer arrayX_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    arrayY_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer arrayY_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    xj_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer xj_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    yj_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer yj_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    CDF_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) * Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer CDF_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    u_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer u_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    likelihood_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer likelihood_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    weights_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer weights_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    I_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (unsigned char) *IszX * IszY * Nfr, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer I_GPU (size:%d) => %d\n", IszX * IszY * Nfr, err);
        return -1;
    }
    objxy_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 2*sizeof (int) *countOnes, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer objxy_GPU (size:%d) => %d\n", countOnes, err);
        return -1;
    }
    ind_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (int) *countOnes * Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer ind_GPU (size:%d) => %d\n", countOnes * Nparticles, err);
        return -1;
    }
    seed_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (int) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer seed_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    partial_sums = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof (double) * Nparticles + 1, likelihood, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer partial_sums (size:%d) => %d\n", Nparticles, err);
        return -1;
    }

	//Donnie - this loop is different because in this kernel, arrayX and arrayY
    //  are set equal to xj before every iteration, so effectively, arrayX and
    //  arrayY will be set to xe and ye before the first iteration.
    for (x = 0; x < Nparticles; x++) {

        xj[x] = xe;
        yj[x] = ye;
    }

    int k;
    //double * Ik = (double *)calloc(IszX*IszY, sizeof(double));
    int indX, indY;
    //start send
    long long send_start = get_time();

    //OpenCL memory copy
    err = clEnqueueWriteBuffer(cmd_queue, I_GPU, 1, 0, sizeof (unsigned char) *IszX * IszY*Nfr, I, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer I_GPU (size:%d) => %d\n", IszX * IszY*Nfr, err);
        return -1;
    }
    err = clEnqueueWriteBuffer(cmd_queue, objxy_GPU, 1, 0, 2*sizeof (int) *countOnes, objxy, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer objxy_GPU (size:%d) => %d\n", countOnes, err);
        return -1; }
    err = clEnqueueWriteBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer weights_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    err = clEnqueueWriteBuffer(cmd_queue, xj_GPU, 1, 0, sizeof (double) *Nparticles, xj, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer arrayX_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    err = clEnqueueWriteBuffer(cmd_queue, yj_GPU, 1, 0, sizeof (double) *Nparticles, yj, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer arrayY_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    err = clEnqueueWriteBuffer(cmd_queue, seed_GPU, 1, 0, sizeof (int) *Nparticles, seed, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer seed_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    /**********************************************************************
     *********** E N D    A L L O C A T E ********************************
     *********************************************************************/

    long long send_end = get_time();
    printf("TIME TO SEND TO GPU: %f\n", elapsed_time(send_start, send_end));
    int num_blocks = ceil((double) Nparticles / (double) threads_per_block);
    printf("threads_per_block=%d \n",threads_per_block);
    size_t local_work[3] = {threads_per_block, 1, 1};
    size_t global_work[3] = {num_blocks*threads_per_block, 1, 1};

    for (k = 1; k < Nfr; k++) {
        /****************** L I K E L I H O O D ************************************/
        clSetKernelArg(kernel_likelihood, 0, sizeof (void *), (void*) &arrayX_GPU);
        clSetKernelArg(kernel_likelihood, 1, sizeof (void *), (void*) &arrayY_GPU);
        clSetKernelArg(kernel_likelihood, 2, sizeof (void *), (void*) &xj_GPU);
        clSetKernelArg(kernel_likelihood, 3, sizeof (void *), (void*) &yj_GPU);
        clSetKernelArg(kernel_likelihood, 4, sizeof (void *), (void*) &CDF_GPU);
        clSetKernelArg(kernel_likelihood, 5, sizeof (void *), (void*) &ind_GPU);
        clSetKernelArg(kernel_likelihood, 6, sizeof (void *), (void*) &objxy_GPU);
        clSetKernelArg(kernel_likelihood, 7, sizeof (void *), (void*) &likelihood_GPU);
        clSetKernelArg(kernel_likelihood, 8, sizeof (void *), (void*) &I_GPU);
        clSetKernelArg(kernel_likelihood, 9, sizeof (void *), (void*) &u_GPU);
        clSetKernelArg(kernel_likelihood, 10, sizeof (void *), (void*) &weights_GPU);
        clSetKernelArg(kernel_likelihood, 11, sizeof (cl_int), (void*) &Nparticles);
        clSetKernelArg(kernel_likelihood, 12, sizeof (cl_int), (void*) &countOnes);
        clSetKernelArg(kernel_likelihood, 13, sizeof (cl_int), (void*) &max_size);
        clSetKernelArg(kernel_likelihood, 14, sizeof (cl_int), (void*) &k);
        clSetKernelArg(kernel_likelihood, 15, sizeof (cl_int), (void*) &IszY);
        clSetKernelArg(kernel_likelihood, 16, sizeof (cl_int), (void*) &Nfr);
        clSetKernelArg(kernel_likelihood, 17, sizeof (void *), (void*) &seed_GPU);
        clSetKernelArg(kernel_likelihood, 18, sizeof (void *), (void*) &partial_sums);
        clSetKernelArg(kernel_likelihood, 19, threads_per_block * sizeof (double), NULL);

        //KERNEL FUNCTION CALL
        err = clEnqueueNDRangeKernel(cmd_queue, kernel_likelihood, 1, NULL, global_work, local_work, 0, 0, 0);
        clFinish(cmd_queue);
        if (err != CL_SUCCESS) {
            printf("ERROR: clEnqueueNDRangeKernel(kernel_likelihood)=>%d failed\n", err);
	    //check_error(err, __FILE__, __LINE__);
            return -1;
        }
        /****************** E N D    L I K E L I H O O D **********************/
        /*************************** S U M ************************************/
        clSetKernelArg(kernel_sum, 0, sizeof (void *), (void*) &partial_sums);
        clSetKernelArg(kernel_sum, 1, sizeof (cl_int), (void*) &Nparticles);

        //KERNEL FUNCTION CALL
        err = clEnqueueNDRangeKernel(cmd_queue, kernel_sum, 1, NULL, global_work, local_work, 0, 0, 0);
        clFinish(cmd_queue);
        if (err != CL_SUCCESS) {
            printf("ERROR: clEnqueueNDRangeKernel(kernel_sum)=>%d failed\n", err);
	    //check_error(err, __FILE__, __LINE__);
            return -1;
        }/*************************** E N D   S U M ****************************/



        /**************** N O R M A L I Z E     W E I G H T S *****************/
        clSetKernelArg(kernel_normalize_weights, 0, sizeof (void *), (void*) &weights_GPU);
        clSetKernelArg(kernel_normalize_weights, 1, sizeof (cl_int), (void*) &Nparticles);
        clSetKernelArg(kernel_normalize_weights, 2, sizeof (void *), (void*) &partial_sums); //*/
        clSetKernelArg(kernel_normalize_weights, 3, sizeof (void *), (void*) &CDF_GPU);
        clSetKernelArg(kernel_normalize_weights, 4, sizeof (void *), (void*) &u_GPU);
        clSetKernelArg(kernel_normalize_weights, 5, sizeof (void *), (void*) &seed_GPU);

        //KERNEL FUNCTION CALL
        err = clEnqueueNDRangeKernel(cmd_queue, kernel_normalize_weights, 1, NULL, global_work, local_work, 0, 0, 0);
        clFinish(cmd_queue);
        if (err != CL_SUCCESS) {
            printf("ERROR: clEnqueueNDRangeKernel(normalize_weights)=>%d failed\n", err);
	    //check_error(err, __FILE__, __LINE__);
            return -1;
        }
        /************* E N D    N O R M A L I Z E     W E I G H T S ***********/

	  //	ocl_print_double_array(cmd_queue, partial_sums, 40);
        // /********* I N T E R M E D I A T E     R E S U L T S ***************/
        // //OpenCL memory copying back from GPU to CPU memory
         err = clEnqueueReadBuffer(cmd_queue, arrayX_GPU, 1, 0, sizeof (double) *Nparticles, arrayX, 0, 0, 0);
         err = clEnqueueReadBuffer(cmd_queue, arrayY_GPU, 1, 0, sizeof (double) *Nparticles, arrayY, 0, 0, 0);
         err = clEnqueueReadBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0);

         xe = 0;
         ye = 0;
         double total=0.0;
         // estimate the object location by expected values
	 for (x = 0; x < Nparticles; x++) {
            // if( 0.0000000 < arrayX[x]*weights[x]) printf("arrayX[%d]:%f, arrayY[%d]:%f, weights[%d]:%0.10f\n",x,arrayX[x], x, arrayY[x], x, weights[x]);
	//	printf("arrayX[%d]:%f | arrayY[%d]:%f | weights[%d]:%f\n",
 	//		x, arrayX[x], x, arrayY[x], x, weights[x]); 
             xe += arrayX[x] * weights[x];
             ye += arrayY[x] * weights[x];
             total+= weights[x];
         }
         printf("total weight: %lf\n", total);
         printf("XE: %lf\n", xe);
         printf("YE: %lf\n", ye);
         double distance = sqrt(pow((double) (xe - (int) roundDouble(IszY / 2.0)), 2) + pow((double) (ye - (int) roundDouble(IszX / 2.0)), 2));
         printf("%lf\n", distance);
        // /********* E N D    I N T E R M E D I A T E     R E S U L T S ***************/

        /******************** F I N D    I N D E X ****************************/
        //Set number of threads

        clSetKernelArg(kernel_find_index, 0, sizeof (void *), (void*) &arrayX_GPU);
        clSetKernelArg(kernel_find_index, 1, sizeof (void *), (void*) &arrayY_GPU);
        clSetKernelArg(kernel_find_index, 2, sizeof (void *), (void*) &CDF_GPU);
        clSetKernelArg(kernel_find_index, 3, sizeof (void *), (void*) &u_GPU);
        clSetKernelArg(kernel_find_index, 4, sizeof (void *), (void*) &xj_GPU);
        clSetKernelArg(kernel_find_index, 5, sizeof (void *), (void*) &yj_GPU);
        clSetKernelArg(kernel_find_index, 6, sizeof (void *), (void*) &weights_GPU);
        clSetKernelArg(kernel_find_index, 7, sizeof (cl_int), (void*) &Nparticles);
        //KERNEL FUNCTION CALL
        err = clEnqueueNDRangeKernel(cmd_queue, kernel_find_index, 1, NULL, global_work, local_work, 0, 0, 0);
        clFinish(cmd_queue);
        if (err != CL_SUCCESS) {
            printf("ERROR: clEnqueueNDRangeKernel(find_index)=>%d failed\n", err);
	    //check_error(err, __FILE__, __LINE__);
            return -1;
        }
        /******************* E N D    F I N D    I N D E X ********************/

    }//end loop

    //block till kernels are finished
    //clFinish(cmd_queue);
    long long back_time = get_time();

    //OpenCL freeing of memory
    clReleaseProgram(prog);
    clReleaseMemObject(u_GPU);
    clReleaseMemObject(CDF_GPU);
    clReleaseMemObject(yj_GPU);
    clReleaseMemObject(xj_GPU);
    clReleaseMemObject(likelihood_GPU);
    clReleaseMemObject(I_GPU);
    clReleaseMemObject(objxy_GPU);
    clReleaseMemObject(ind_GPU);
    clReleaseMemObject(seed_GPU);
    clReleaseMemObject(partial_sums);

    long long free_time = get_time();

    //OpenCL memory copying back from GPU to CPU memory
    err = clEnqueueReadBuffer(cmd_queue, arrayX_GPU, 1, 0, sizeof (double) *Nparticles, arrayX, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: Memcopy Out\n");
        return -1;
    }
    long long arrayX_time = get_time();
    err = clEnqueueReadBuffer(cmd_queue, arrayY_GPU, 1, 0, sizeof (double) *Nparticles, arrayY, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: Memcopy Out\n");
        return -1;
    }
    long long arrayY_time = get_time();
    err = clEnqueueReadBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: Memcopy Out\n");
        return -1;
    }
    long long back_end_time = get_time();

    printf("GPU Execution: %lf\n", elapsed_time(send_end, back_time));
    printf("FREE TIME: %lf\n", elapsed_time(back_time, free_time));
    printf("SEND TO SEND BACK: %lf\n", elapsed_time(back_time, back_end_time));
    printf("SEND ARRAY X BACK: %lf\n", elapsed_time(free_time, arrayX_time));
    printf("SEND ARRAY Y BACK: %lf\n", elapsed_time(arrayX_time, arrayY_time));
    printf("SEND WEIGHTS BACK: %lf\n", elapsed_time(arrayY_time, back_end_time));

    xe = 0;
    ye = 0;
    // estimate the object location by expected values
    for (x = 0; x < Nparticles; x++) {
        xe += arrayX[x] * weights[x];
        ye += arrayY[x] * weights[x];
    }
    double distance = sqrt(pow((double) (xe - (int) roundDouble(IszY / 2.0)), 2) + pow((double) (ye - (int) roundDouble(IszX / 2.0)), 2));

    //Output results
    FILE *fid;
    fid=fopen("output.txt", "w+");
    if( fid == NULL ){
      printf( "The file was not opened for writing\n" );
      return -1;
    }
    fprintf(fid, "XE: %lf\n", xe);
    fprintf(fid, "YE: %lf\n", ye);
    fprintf(fid, "distance: %lf\n", distance);
    fclose(fid);


    //OpenCL freeing of memory
    clReleaseMemObject(weights_GPU);
    clReleaseMemObject(arrayY_GPU);
    clReleaseMemObject(arrayX_GPU);

    //free regular memory
    free(likelihood);
    free(arrayX);
    free(arrayY);
    free(xj);
    free(yj);
    free(CDF);
    free(ind);
    free(u);
}
void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv)
{
    cl_int ciErrNum;

	if (!m_cxMainContext)
	{
		
		cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
		m_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0, 0);
	
		int numDev = btOpenCLUtils::getNumDevices(m_cxMainContext);
		if (!numDev)
		{
			btAssert(0);
			exit(0);//this is just a demo, exit now
		}

		m_cdDevice =  btOpenCLUtils::getDevice(m_cxMainContext,0);
    	oclCHECKERROR(ciErrNum, CL_SUCCESS);

		btOpenCLDeviceInfo clInfo;
		btOpenCLUtils::getDeviceInfo(m_cdDevice,clInfo);
		btOpenCLUtils::printDeviceInfo(m_cdDevice);

		// create a command-queue
		m_cqCommandQue = clCreateCommandQueue(m_cxMainContext, m_cdDevice, 0, &ciErrNum);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}
	// Program Setup
	size_t program_length;


#ifdef LOAD_FROM_MEMORY
	program_length = strlen(source);
	printf("OpenCL compiles ParticlesOCL.cl ... ");
#else

	const char* fileName = "ParticlesOCL.cl";
	FILE * fp = fopen(fileName, "rb");
	char newFileName[512];
	
	if (fp == NULL)
	{
		sprintf(newFileName,"..//%s",fileName);
		fp = fopen(newFileName, "rb");
		if (fp)
			fileName = newFileName;
	}
	
	if (fp == NULL)
	{
		sprintf(newFileName,"Demos//ParticlesOpenCL//%s",fileName);
		fp = fopen(newFileName, "rb");
		if (fp)
			fileName = newFileName;
	}

	if (fp == NULL)
	{
		sprintf(newFileName,"..//..//..//..//..//Demos//ParticlesOpenCL//%s",fileName);
		fp = fopen(newFileName, "rb");
		if (fp)
			fileName = newFileName;
		else
		{
			printf("cannot find %s\n",newFileName);
			exit(0);
		}
	}

//	char *source = oclLoadProgSource(".//Demos//SpheresGrid//SpheresGrid.cl", "", &program_length);
	//char *source = btOclLoadProgSource(".//Demos//SpheresOpenCL//Shared//SpheresGrid.cl", "", &program_length);

	char *source = btOclLoadProgSource(fileName, "", &program_length);
	if(source == NULL)
	{
		printf("ERROR : OpenCL can't load file %s\n", fileName);
	}
//	oclCHECKERROR (source == NULL, oclFALSE);   
	btAssert(source != NULL);

	// create the program
	printf("OpenCL compiles %s ...", fileName);

#endif //LOAD_FROM_MEMORY


	//printf("%s\n", source);

	m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&source, &program_length, &ciErrNum);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
#ifndef LOAD_FROM_MEMORY
	free(source);
#endif //LOAD_FROM_MEMORY

	//#define LOCAL_SIZE_LIMIT 1024U
#define LOCAL_SIZE_MAX 1024U

		    // Build the program with 'mad' Optimization option
#ifdef MAC
	const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -cl-mad-enable -DMAC -DGUID_ARG";
#else
	const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -DGUID_ARG= ";
#endif
	// build the program
	ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, flags, NULL, NULL);
	if(ciErrNum != CL_SUCCESS)
	{
		// write out standard error
//		oclLog(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
		// write out the build log and ptx, then exit
		char cBuildLog[10240];
//		char* cPtx;
//		size_t szPtxLength;
		clGetProgramBuildInfo(m_cpProgram, m_cdDevice, CL_PROGRAM_BUILD_LOG, 
							  sizeof(cBuildLog), cBuildLog, NULL );
//		oclGetProgBinary(m_cpProgram, oclGetFirstDev(m_cxMainContext), &cPtx, &szPtxLength);
//		oclLog(LOGBOTH | CLOSELOG, 0.0, "\n\nLog:\n%s\n\n\n\n\nPtx:\n%s\n\n\n", cBuildLog, cPtx);
		printf("\n\n%s\n\n\n", cBuildLog);
		printf("Press ENTER key to terminate the program\n");
		getchar();
		exit(-1); 
	}
	printf("OK\n");

	// create the kernels

	postInitDeviceData();

	initKernel(PARTICLES_KERNEL_COMPUTE_CELL_ID, "kComputeCellId");
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 2, sizeof(cl_mem), (void*) &m_dPosHash);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSimParams);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	initKernel(PARTICLES_KERNEL_INTEGRATE_MOTION, "kIntegrateMotion");
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 1, sizeof(cl_mem), (void *) &m_dPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 2, sizeof(cl_mem), (void *) &m_dVel);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 3, sizeof(cl_mem), (void *) &m_dSimParams);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);


	initKernel(PARTICLES_KERNEL_CLEAR_CELL_START, "kClearCellStart");
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 0, sizeof(int),		(void *) &m_numGridCells);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dCellStart);

	initKernel(PARTICLES_KERNEL_FIND_CELL_START, "kFindCellStart");
//	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 0, sizeof(int),	(void*) &m_numParticles);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dPosHash);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 2, sizeof(cl_mem),	(void*) &m_dCellStart);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 3, sizeof(cl_mem),	(void*) &m_dPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 4, sizeof(cl_mem),	(void*) &m_dVel);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 5, sizeof(cl_mem),	(void*) &m_dSortedPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 6, sizeof(cl_mem),	(void*) &m_dSortedVel);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	initKernel(PARTICLES_KERNEL_COLLIDE_PARTICLES, "kCollideParticles");
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dVel);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 2, sizeof(cl_mem),	(void*) &m_dSortedPos);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 3, sizeof(cl_mem),	(void*) &m_dSortedVel);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 4, sizeof(cl_mem),	(void*) &m_dPosHash);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 5, sizeof(cl_mem),	(void*) &m_dCellStart);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 6, sizeof(cl_mem),	(void*) &m_dSimParams);

	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL, "kBitonicSortCellIdLocal");
	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1, "kBitonicSortCellIdLocal1");
	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL, "kBitonicSortCellIdMergeGlobal");
	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL, "kBitonicSortCellIdMergeLocal");
}
Example #5
0
int main(int argc, char** argv)
{
    int err;                            // error code returned from api calls
    
    float data[DATA_SIZE];              // original data set given to device
    float results[DATA_SIZE];           // results returned from device
    unsigned int correct;               // number of correct results returned
    
    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation
    
    cl_device_id device_id;             // compute device id
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel
    
    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array
    
    // Fill our data set with random float values
    //
    int i = 0;
    unsigned int count = DATA_SIZE;
    for(i = 0; i < count; i++)
        data[i] = rand() / (float)RAND_MAX;
    
    // Connect to a compute device
    //
    int gpu = 1;
    err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }
    
    // Create a compute context
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }
    
    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }
    
    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }
    
    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
        
        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }
    
    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "square", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }
    
    // Create the input and output arrays in device memory for our calculation
    //
    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }
    
    // Write our data set into the input array in device memory
    //
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }
    
    // Set the arguments to our compute kernel
    //
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }
    
    // Get the maximum work group size for executing the kernel on the device
    //
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }
    
    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    //
    global = count;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }
    
    // Wait for the command commands to get serviced before reading back results
    //
    clFinish(commands);
    
    // Read back the results from the device to verify the output
    //
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }
    
    // Validate our results
    //
    correct = 0;
    for(i = 0; i < count; i++)
    {
        if(results[i] == data[i] * data[i])
            correct++;
    }
    
    // Print a brief summary detailing the results
    //
    printf("Computed '%d/%d' correct values!\n", correct, count);
    
    // Shutdown and cleanup
    //
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    
    return 0;
}
int main(int argc, char** argv)
{
    int          rank, size;         // MPI rank & size
    int          err;                // error code returned from OpenCL calls
    float        h_a[LENGTH];        // a vector
    float        h_b[LENGTH];        // b vector
    float        h_c[LENGTH];        // c vector (a+b) returned from the compute device (local per task)
    float        _h_c[LENGTH];       // c vector (a+b) returned from the compute device (global for master)
    unsigned int correct;            // number of correct results

    size_t global;                   // global domain size
    size_t local;                    // local  domain size

    cl_device_id     device_id;      // compute device id
    cl_context       context;        // compute context
    cl_command_queue commands;       // compute command queue
    cl_program       program;        // compute program
    cl_kernel        ko_vadd;        // compute kernel

    cl_mem d_a;                      // device memory used for the input  a vector
    cl_mem d_b;                      // device memory used for the input  b vector
    cl_mem d_c;                      // device memory used for the output c vector

    int mycount, i;

    err = MPI_Init (&argc, &argv);

    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Init failed!\n");
        exit (-1);
    }

    err = MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Comm_rank failed!\n");
        exit (-1);
    }

    err = MPI_Comm_size (MPI_COMM_WORLD, &size);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Comm_size failed\n");
        exit (-1);
    }

    if (LENGTH % size != 0)
    {
        printf ("Number of MPI processes must divide LENGTH (%d)\n", LENGTH);
        exit (-1);
    }

    mycount = LENGTH / size;

    if (rank == 0)
    {
        for (i = 0; i < LENGTH; i++)
        {
            h_a[i] = rand() / (float)RAND_MAX;
            h_b[i] = rand() / (float)RAND_MAX;
            h_a[i] = i;
            h_b[i] = i*2;
        }
        err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed transferring h_a\n");
            exit (-1);
        }
        err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed transferring h_b\n");
            exit (-1);
        }
    }
    else
    {
        err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed receiving h_a\n");
            exit (-1);
        }
        err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed receiving h_b\n");
            exit (-1);
        }
    }

    // Set up platform
    cl_uint numPlatforms;

    // Find number of platforms
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to find a platform!\n");
        return EXIT_FAILURE;
    }

    // Get all platforms
    cl_platform_id Platform[numPlatforms];
    err = clGetPlatformIDs(numPlatforms, Platform, NULL);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to get the platform!\n");
        return EXIT_FAILURE;
    }

    // Secure a GPU
    for (i = 0; i < numPlatforms; i++)
    {
        err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
        if (err == CL_SUCCESS)
            break;
    }

    if (device_id == NULL)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }
    else
    {
        if (output_device_info (rank, device_id) != CL_SUCCESS)
            return EXIT_FAILURE;
    }

    // Create a compute context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }

    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    // Create the compute kernel from the program
    ko_vadd = clCreateKernel(program, "vadd", &err);
    if (!ko_vadd || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }

    // Create the input (a, b) and output (c) arrays in device memory
    d_a = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * mycount, NULL, NULL);
    d_b = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * mycount, NULL, NULL);
    d_c = clCreateBuffer(context,  CL_MEM_WRITE_ONLY, sizeof(float) * mycount, NULL, NULL);
    if (!d_a || !d_b || !d_c)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }

    // Write a and b vectors into compute device memory
    err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(float) * mycount, &h_a[rank*mycount], 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write h_a to source array!\n");
        exit(1);
    }

    err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0, sizeof(float) * mycount, &h_b[rank*mycount], 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write h_b to source array!\n");
        exit(1);
    }

    // Set the arguments to our compute kernel
    err  = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &mycount);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }

    // Get the maximum work group size for executing the kernel on the device
    err = clGetKernelWorkGroupInfo(ko_vadd, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }

    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    global = LENGTH;
    err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }

    // Wait for the commands to complete before reading back results
    clFinish(commands);

    // Read back the results from the compute device
    err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * mycount, &h_c, 0, NULL, NULL );
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }

    err = MPI_Gather (h_c, mycount, MPI_FLOAT, _h_c, mycount, MPI_FLOAT, 0, MPI_COMM_WORLD);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Gather failed receiving h_c\n");
        exit (-1);
    }

    if (rank == 0)
    {
        // Test the results
        correct = 0;
        float tmp;

        for(i = 0; i < LENGTH; i++)
        {
            tmp = h_a[i] + h_b[i];     // assign element i of a+b to tmp
            tmp -= _h_c[i];             // compute deviation of expected and output result
            if(tmp*tmp < TOL*TOL)      // correct if square deviation is less than tolerance squared
                correct++;
            else
                printf(" tmp %f h_a %f h_b %f h_c %f \n",tmp, h_a[i], h_b[i], _h_c[i]);
        }

        // summarize results
        printf("C = A+B:  %d out of %d results were correct.\n", correct, LENGTH);
    }

    // cleanup then shutdown
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(ko_vadd);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    err = MPI_Finalize ();
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Finalize failed!\n");
        exit (-1);
    }

    return 0;
}
Example #7
0
void initopencl(void) {
	
	int i;


	// Get Platform and Device Info
	CL_CHECK(clGetPlatformIDs(1, &platform_id, &num_platforms));
	
	// Currently this program only runs on a SINGLE GPU.
	CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices));
	printf("=== %d OpenCL platform(s) found: ===\n", num_platforms);
	printf("=== %d OpenCL device(s) found on platform:\n", num_devices);
	
	
	char buffer[10240];
	cl_uint buf_uint;
	cl_ulong buf_ulong;
	printf("  -- %d --\n", i);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), buffer, NULL));
	printf("  DEVICE_NAME = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL));
	printf("  DEVICE_VENDOR = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL));
	printf("  DEVICE_VERSION = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL));
	printf("  DRIVER_VERSION = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL));
	printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL));
	printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL));
	printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);

	if (num_devices == 0)
	{	
		fprintf(stderr, "No Devices found that can run OpenCL.");
		exit(0);	
	}
	// Create OpenCL context
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
	if (ret != CL_SUCCESS) {
		
		fprintf(stderr, "Error creating context: Function returned %d \n\n", ret);
		exit(1);
	
	}
	// Create Command Queue
	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
	if (ret != CL_SUCCESS) {
		
		fprintf(stderr, "Error creating command Queue: Function returned %d \n\n", ret);
		exit(1);
	
	}
	
	// Load the kernel source code into the array source_str
	FILE *fp;
	char *source_str;
	size_t source_size;
	
	fp = fopen("integrate.cl", "r");
	if (!fp) {
	    fprintf(stderr, "Failed to load kernel.\n");
	    exit(1);
	}
	
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose( fp );	
	
	
	// Create a program from the kernel source
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);	
    if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a program for integration3D. %d \n\n", (int)ret);
			exit(1);
	}
    // Build the program
    
    ret = clBuildProgram(program, 1, &device_id, "-DUSE_DOUBLE=1", NULL, NULL); 
    if (ret != CL_SUCCESS)
    {
    
    	size_t length;
    	char buffer[10240];
    	clGetProgramBuildInfo(program, 1, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length);
    	fprintf(stderr, "Error returned %d. \n\n", (int)ret);
    	printf("Error Log: \n\n %s \n\n", buffer);
    	exit(0);
    }
	
/*    // Create the OpenCL kernel (compute_points_Unstructure3D_1)
    kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n");
			exit(1);
	}
*/	
	// Create the OpenCL kernel (check_int)
    kernel2 = clCreateKernel(program, "check_int", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for check_int. %d \n\n", (int)ret);
			exit(1);
	}
	
    
    // Create the OpenCL kernel (compute_points_Unstructure3D_1)
    kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n");
			exit(1);
	}
	
	// Create the OpenCL kernel (initialize_timestep3D)
	kernel3 = clCreateKernel(program, "initialize_timestep3D", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for initialize_timestep3D. \n\n");
			exit(1);
	}
	
	// Create the OpenCL kernel (initialize_timestep3D)
    kernel4 = clCreateKernel(program, "LocalSearch3D", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n");
			exit(1);
	}
	
	// Create the OpenCL kernel (initialize_timestep3D)
    kernel5 = clCreateKernel(program, "compute_points_Unstructure3D_2", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n");
			exit(1);
	}
	
	
	
	printf("\n\n");
}
void OpenCLWaveSimulation::initOCL()
{
    const unsigned int vboSize = m_gridWidth * m_gridHeight * VERTEX_SIZE * sizeof(float);

	// first get number of available platt forms
	cl_uint numPlattforms = 0;
	clGetPlatformIDs(0, NULL, &numPlattforms);

	// then allocate enough memory for all IDs
	cl_platform_id* plattforms = new cl_platform_id[numPlattforms];
    clGetPlatformIDs(numPlattforms, plattforms, NULL);

	cl_device_type deviceType = (m_argc <= 1) ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;

	// now try to find the right plattform for given device type
	for (unsigned int i = 0; i < numPlattforms; ++i) 
	{
		clGetDeviceIDs(plattforms[i], deviceType, 1, &m_device, NULL);

		if (m_device)
		{
			m_platform = plattforms[i];
			break;
		}
	}
	delete plattforms;
   

    // ocl context must be tied to the ogl context
#   ifdef _WIN32
        HGLRC glCtx = wglGetCurrentContext();
#   else
        GLXContext glCtx = glXGetCurrentContext();
#   endif

    cl_context_properties props[] = {CL_CONTEXT_PLATFORM,
        (cl_context_properties)m_platform,
#   ifdef _WIN32
        CL_WGL_HDC_KHR, (intptr_t) wglGetCurrentDC(),
#   else
        CL_GLX_DISPLAY_KHR, (intptr_t) glXGetCurrentDisplay();
#   endif
    CL_GL_CONTEXT_KHR, (intptr_t) glCtx, 0
    };

    // create context and queue
    m_context = clCreateContext(props, 1, &m_device, NULL, NULL, NULL);
    m_queue = clCreateCommandQueue(m_context, m_device, 0, NULL);

    // create buffers
    int errCode;
    m_clPositionInteropBuffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, m_positionVBO, &errCode);
    if(errCode != CL_SUCCESS)
    {
        std::cerr << "Failed creating cl_mem position buffer from gl buffer\n";
    }

    m_clNormalInteropBuffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, m_normalVBO, &errCode);
    if(errCode != CL_SUCCESS)
    {
        std::cerr << "Failed creating cl_mem normal buffer from gl buffer\n";
    }

    m_clTangentInteropBuffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, m_tangentVBO, &errCode);
    if(errCode != CL_SUCCESS)
    {
        std::cerr << "Failed creating cl_mem tangent buffer from gl buffer\n";
    }

    m_clPing = clCreateBuffer(m_context,
                              CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                              vboSize,
                              reinterpret_cast<float*>(m_waves.getVertices()),
                              &errCode);
    if(errCode != CL_SUCCESS)
    {
        std::cerr << "Failed creating cl_mem read write buffer\n";
    }

    m_clPong = clCreateBuffer(m_context,
                              CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                              vboSize,
                              reinterpret_cast<float*>(m_waves.getVertices()),
                              &errCode);
    if(errCode != CL_SUCCESS)
    {
        std::cerr << "Failed creating cl_mem read write buffer\n";
    }

    // load source file
    std::ifstream file("WaveSimulation.cl");
    std::string prog(std::istreambuf_iterator<char>(file), (std::istreambuf_iterator<char>()));
    file.close();

    const char* source = prog.c_str();
    const size_t kernelsize = prog.length() + 1;
    m_program = clCreateProgramWithSource(m_context, 1, (const char**)&source, &kernelsize, NULL);

    // build program
    int err = clBuildProgram(m_program, 0, NULL, NULL, NULL, NULL);
    if(err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
        std::cerr << "Error: Failed to build program executable!" << std::endl;
        clGetProgramBuildInfo(m_program, m_device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        std::cerr << buffer << std::endl;
        exit(1);
    }

    // create the compute kernels in the program
    m_vertexDisplacementKernel = clCreateKernel(m_program, "compute_vertex_displacement", &err);
    if(!m_vertexDisplacementKernel || err != CL_SUCCESS)
    {
        std::cerr << "Error: Failed to create compute kernel: compute_vertex_displacement!" << std::endl;
        exit(1);
    }

    m_finiteDifferenceSchemeKernel = clCreateKernel(m_program, "compute_finite_difference_scheme", &err);
    if(!m_finiteDifferenceSchemeKernel || err != CL_SUCCESS)
    {
        std::cerr << "Error: Failed to create compute kernel: compute_finite_difference_scheme!" << std::endl;
        exit(1);
    }

    m_disturbKernel = clCreateKernel(m_program, "disturb_grid", &err);
    if(!m_disturbKernel || err != CL_SUCCESS)
    {
        std::cerr << "Error: Failed to create compute kernel: disturb_grid!" << std::endl;
        exit(1);
    }

    m_glGridInitKernel = clCreateKernel(m_program, "initialize_gl_grid", &err);
    if(!m_glGridInitKernel || err != CL_SUCCESS)
    {
        std::cerr << "Error: Failed to create compute kernel: initialize_gl_grid!" << std::endl;
        exit(1);
    }
    initGLBuffer();
}
void BurstSort::parallelSort(std::ofstream& file){
	char* buffer = NULL;
	char* tmp;
	int* posArray = NULL;
	int entryLength = KEY_LENGTH + sizeof(char*);
	buffer = (char*) malloc(sizeof(char) * size * entryLength);
	posArray = (int*) malloc(sizeof(int) * (NODE_SIZE + 1));
	int pos = 0;
	posArray[0] = 0;
	for(int i = 0; i < NODE_SIZE; i++){
		for(int j = 0; j < nodes[i].used; j++){
			memcpy(buffer + pos * entryLength, nodes[i].entries[j], KEY_LENGTH * sizeof(char));
			memcpy(buffer + pos * entryLength + KEY_LENGTH, &nodes[i].entries[j], sizeof(char*));
			pos += sizeof(char);
		}
		posArray[i+1] = pos;
	}

	// OpenCL
	// Use this to check the output of each API call
    cl_int status;  
	cl_int numDevices = 1;
	
	// Connect to first platform
    cl_platform_id platform;
    status = clGetPlatformIDs(1, &platform, NULL);

	if (status != CL_SUCCESS) {
		printf("Error: Failed to find an OpenCL platform!\n");
		return -1;
	}
 
	char cBuffer[1024];
	clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(cBuffer), cBuffer, NULL);
	printf("CL_PLATFORM_VENDOR %s\n", cBuffer);

	clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
	printf("CL_PLATFORM_NAME %s\n", cBuffer);

    cl_device_id device;
	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, 1, &device, NULL);

	if (status != CL_SUCCESS) {
		printf("Error: Failed to create a device group!\n");
		return -1;
	}

	cl_long maxBufferSize = 0;
	status = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_long), &maxBufferSize, NULL);
	printf("max buffer size: %lld\n", maxBufferSize);

    // Create a context and associate it with the devices
    cl_context context;
    context = clCreateContext(NULL, numDevices, &device, NULL, NULL, &status);
	

	if (status != CL_SUCCESS) {
		printf("Error in creating context, code %d\n", status);
		return -1;
	}
    // Create a command queue and associate it with the device 
    cl_command_queue cmdQueue;
    cmdQueue = clCreateCommandQueue(context, device, 0, &status);

	if (status != CL_SUCCESS) {
		printf("Error in creating command queue for a device, code %d\n", status);
		return -1;
	}

	// Load binary from disk
	unsigned char *kernelbinary;
	char *xclbin = "sort_xiaohui.xclbin";
	printf("loading %s\n", xclbin);
	int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
	if (n_i < 0) {
		printf("ERROR: failed to load kernel from xclbin: %s\n", xclbin);
		return -1;
	}
	size_t n_bit = n_i;

	// Create the compute program from offline
	cl_program program = clCreateProgramWithBinary(context, 1, &device, &n_bit,
			(const unsigned char **) &kernelbinary, NULL, &status);
	if ((!program) || (status != CL_SUCCESS)) {
		printf("Error: Failed to create compute program from binary %d!\n", status);
		return -1;
	}

	// Build the program executable
	status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

	if (status != CL_SUCCESS) {
		size_t len;
		char buffer[2048];

		printf("Error: Failed to build program executable!\n");
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);
		return -1;
	}

	// Create the vector addition kernel
    cl_kernel kernel;
    kernel = clCreateKernel(program, "sort", &status);


	cl_mem clPosArray;
	cl_mem clBuffer;
	clBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, 
		sizeof(char) * size * entryLength, NULL, &status);

	clPosArray = clCreateBuffer(context, CL_MEM_READ_ONLY, 
		sizeof(int) * (NODE_SIZE + 1), NULL, &status);
	
	status = clEnqueueWriteBuffer(cmdQueue, clPosArray, CL_FALSE, 
		0, sizeof(int) * (NODE_SIZE + 1),posArray, 0, NULL, NULL);

	status = clEnqueueWriteBuffer(cmdQueue, clBuffer, CL_FALSE, 
		0, sizeof(char) * size * entryLength, buffer, 0, NULL, NULL);


    // Associate the input and output buffers with the kernel 
	status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &clBuffer);

	status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &clPosArray);

	int nodeSize = NODE_SIZE;
	status = clSetKernelArg(kernel, 2, sizeof(int), (void *)&nodeSize);

	status = clSetKernelArg(kernel, 3, sizeof(int), (void *)&entryLength);

	size_t globalWorkSize[1];   

	globalWorkSize[0] = NODE_SIZE;

    gettimeofday(&t1, NULL);
	// Execute the kernel for execution
    status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);

	if (status != CL_SUCCESS) {
		printf("Error in clEnqueue, code %d\n", status);
		return -1;
	}


    // Read the device output buffer to the host output array
	clEnqueueReadBuffer(cmdQueue, clBuffer, CL_TRUE, 0, 
		sizeof(char) * size * entryLength, buffer, 0, NULL, NULL);

    // Free OpenCL resources
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(cmdQueue);
	clReleaseMemObject(clBuffer);
	clReleaseMemObject(clPosArray);
	clReleaseContext(context);

    //print result
	for(int i = 0; i < size; i+= sizeof(char)){
		memcpy(&tmp,buffer + i * entryLength + KEY_LENGTH,sizeof(char*));
		file << tmp;
	}

    // Free host resources
	free(buffer);
	free(posArray);

	free(platforms);
	free(devices);

}
Example #10
0
int deflate259_opencl(unsigned char* input, unsigned in_len, unsigned char* tree,
  unsigned tree_len, unsigned char* output, unsigned* out_len)
{
#define SDACCEL_WRAPPER
#ifdef SDACCEL_WRAPPER
  int err;                            // error code returned from api calls

  cl_platform_id platform_id;         // platform id
  cl_device_id device_id;             // compute device id 
  cl_context context;                 // compute context
  cl_command_queue commands;          // compute command queue
  cl_program program;                 // compute program
  cl_kernel kernel;                   // compute kernel
   
  char cl_platform_vendor[1001];
  char cl_platform_name[1001];

  err = clGetPlatformIDs(1,&platform_id,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to find an OpenCL platform!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor);
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  printf("CL_PLATFORM_NAME %s\n",cl_platform_name);

  // Connect to a compute device
  //
  int fpga = 0;
#if defined (FPGA_DEVICE)
  fpga = 1;
#endif
  err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU,
                       1, &device_id, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to create a device group!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create a compute context 
  //
  context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  if (!context)
  {
    printf("Error: Failed to create a compute context!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create a command commands
  //
  commands = clCreateCommandQueue(context, device_id, 0, &err);
  if (!commands)
  {
    printf("Error: Failed to create a command commands!\n");
    printf("Error: code %i\n",err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  int status;

  // Create Program Objects
  //
  
  // Load binary from disk
  unsigned char *kernelbinary;
  char xclbin[]="deflate1.xclbin";
  printf("loading %s\n", xclbin);
  int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
  if (n_i < 0) {
    printf("failed to load kernel from xclbin: %s\n", xclbin);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  size_t n = n_i;
  // Create the compute program from offline
  program = clCreateProgramWithBinary(context, 1, &device_id, &n,
                                      (const unsigned char **) &kernelbinary, &status, &err);
  if ((!program) || (err!=CL_SUCCESS)) {
    printf("Error: Failed to create compute program from binary %d!\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Build the program executable
  //
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    size_t len;
    char buffer[2048];

    printf("Error: Failed to build program executable!\n");
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    printf("%s\n", buffer);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create the compute kernel in the program we wish to run
  //
  kernel = clCreateKernel(program, "deflate259", &err);
  if (!kernel || err != CL_SUCCESS)
  {
    printf("Error: Failed to create compute kernel! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create the input and output arrays in device memory for our calculation
//  void deflate259_opencl(unsigned char* input, unsigned in_len, unsigned char* tree,
//    unsigned tree_len, unsigned char* output, unsigned* out_len)
  cl_mem input_arg, in_len_arg, tree_arg, tree_len_arg, output_arg, out_len_arg;
  input_arg = clCreateBuffer(context,  CL_MEM_READ_ONLY, CHUNK, NULL, NULL);
  in_len_arg = clCreateBuffer(context,  CL_MEM_READ_ONLY, sizeof(unsigned), NULL, NULL);
  tree_arg = clCreateBuffer(context,  CL_MEM_READ_ONLY, 512, NULL, NULL);
  tree_len_arg = clCreateBuffer(context,  CL_MEM_READ_ONLY, sizeof(unsigned), NULL, NULL);
  output_arg = clCreateBuffer(context, CL_MEM_WRITE_ONLY, CHUNK*2, NULL, NULL);
  out_len_arg = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(unsigned), NULL, NULL);

  if (!input_arg || !in_len_arg || !tree_arg || !tree_len_arg || !output_arg || !out_len_arg)
  {
    printf("Error: Failed to allocate device memory!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }    

  err = clEnqueueWriteBuffer(commands, input_arg, CL_TRUE, 0, in_len, input, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array input!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  err = clEnqueueWriteBuffer(commands, in_len_arg, CL_TRUE, 0, sizeof(unsigned), &in_len, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array &in_len!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  err = clEnqueueWriteBuffer(commands, tree_arg, CL_TRUE, 0, 512, tree, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array tree!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  err = clEnqueueWriteBuffer(commands, tree_len_arg, CL_TRUE, 0, sizeof(unsigned), &tree_len, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array &tree_len!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Set the arguments to our compute kernel
//void deflate259_opencl(unsigned char* input, unsigned in_len, unsigned char* tree,
//  unsigned tree_len, unsigned char* output, unsigned* out_len)
  err = 0;
  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_arg);
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &in_len_arg);
  err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &tree_arg);
  err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &tree_len_arg);
  err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &output_arg);
  err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &out_len_arg);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Execute the kernel over the entire range of our 1d input data set
  // using the maximum number of work group items for this device
  //

#ifdef C_KERNEL
  err = clEnqueueTask(commands, kernel, 0, NULL, NULL);
#else
  size_t global[1];
  size_t local[1];
  global[0] = 1;
  local[0] = 1;
  err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, 
                               (size_t*)&global, (size_t*)&local, 0, NULL, NULL);
#endif
  if (err)
  {
    printf("Error: Failed to execute kernel! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Read back the results from the device to verify the output
  //
  cl_event readevent;
  unsigned out_len_b;
  err = clEnqueueReadBuffer( commands, out_len_arg, CL_TRUE, 0, sizeof(unsigned),
      &out_len_b, 0, NULL, &readevent );
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output length! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  clWaitForEvents(1, &readevent);
  *out_len = out_len_b;

  printf("Read final output length: %d\n", out_len_b);

  err = clEnqueueReadBuffer( commands, output_arg, CL_TRUE, 0, out_len_b, output, 0, NULL, &readevent );
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output data! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  clWaitForEvents(1, &readevent);
#endif
}
Example #11
0
_clState *initCl(unsigned int gpu, char *name, size_t nameSize)
{
	_clState *clState = calloc(1, sizeof(_clState));
	bool patchbfi = false, prog_built = false;
	struct cgpu_info *cgpu = &gpus[gpu];
	cl_platform_id platform = NULL;
	char pbuff[256], vbuff[255];
	cl_platform_id* platforms;
	cl_uint preferred_vwidth;
	cl_device_id *devices;
	cl_uint numPlatforms;
	cl_uint numDevices;
	cl_int status;

	status = clGetPlatformIDs(0, NULL, &numPlatforms);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Platforms. (clGetPlatformsIDs)", status);
		return NULL;
	}

	platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id));
	status = clGetPlatformIDs(numPlatforms, platforms, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status);
		return NULL;
	}

	if (opt_platform_id >= (int)numPlatforms) {
		applog(LOG_ERR, "Specified platform that does not exist");
		return NULL;
	}

	status = clGetPlatformInfo(platforms[opt_platform_id], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status);
		return NULL;
	}
	platform = platforms[opt_platform_id];

	if (platform == NULL) {
		perror("NULL platform found!\n");
		return NULL;
	}

	applog(LOG_INFO, "CL Platform vendor: %s", pbuff);
	status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL);
	if (status == CL_SUCCESS)
		applog(LOG_INFO, "CL Platform name: %s", pbuff);
	status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(vbuff), vbuff, NULL);
	if (status == CL_SUCCESS)
		applog(LOG_INFO, "CL Platform version: %s", vbuff);

	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status);
		return NULL;
	}

	if (numDevices > 0 ) {
		devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id));

		/* Now, get the device list data */

		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Getting Device IDs (list)", status);
			return NULL;
		}

		applog(LOG_INFO, "List of devices:");

		unsigned int i;
		for (i = 0; i < numDevices; i++) {
			status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS) {
				applog(LOG_ERR, "Error %d: Getting Device Info", status);
				return NULL;
			}

			applog(LOG_INFO, "\t%i\t%s", i, pbuff);
		}

		if (gpu < numDevices) {
			status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS) {
				applog(LOG_ERR, "Error %d: Getting Device Info", status);
				return NULL;
			}

			applog(LOG_INFO, "Selected %i: %s", gpu, pbuff);
			strncpy(name, pbuff, nameSize);
		} else {
			applog(LOG_ERR, "Invalid GPU %i", gpu);
			return NULL;
		}

	} else return NULL;

	cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };

	clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status);
		return NULL;
	}

	/////////////////////////////////////////////////////////////////
	// Create an OpenCL command queue
	/////////////////////////////////////////////////////////////////
	clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu],
						     CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status);
	if (status != CL_SUCCESS) /* Try again without OOE enable */
		clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status);
		return NULL;
	}

	/* Check for BFI INT support. Hopefully people don't mix devices with
	 * and without it! */
	char * extensions = malloc(1024);
	const char * camo = "cl_amd_media_ops";
	char *find;

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS", status);
		return NULL;
	}
	find = strstr(extensions, camo);
	if (find)
		clState->hasBitAlign = true;
		
	/* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */
	char * devoclver = malloc(1024);
	const char * ocl10 = "OpenCL 1.0";
	const char * ocl11 = "OpenCL 1.1";

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_VERSION, 1024, (void *)devoclver, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_VERSION", status);
		return NULL;
	}
	find = strstr(devoclver, ocl10);
	if (!find) {
		clState->hasOpenCL11plus = true;
		find = strstr(devoclver, ocl11);
		if (!find)
			clState->hasOpenCL12plus = true;
	}

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&preferred_vwidth, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", status);
		return NULL;
	}
	applog(LOG_DEBUG, "Preferred vector width reported %d", preferred_vwidth);

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE", status);
		return NULL;
	}
	applog(LOG_DEBUG, "Max work group size reported %d", (int)(clState->max_work_size));

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status);
		return NULL;
	}
	applog(LOG_DEBUG, "Max mem alloc size is %lu", (long unsigned int)(cgpu->max_alloc));

	/* Create binary filename based on parameters passed to opencl
	 * compiler to ensure we only load a binary that matches what would
	 * have otherwise created. The filename is:
	 * name + kernelname +/- g(offset) + v + vectors + w + work_size + l + sizeof(long) + .bin
	 * For scrypt the filename is:
	 * name + kernelname + g + lg + lookup_gap + tc + thread_concurrency + w + work_size + l + sizeof(long) + .bin
	 */
	char binaryfilename[255];
	char filename[255];
	char numbuf[16];

	if (cgpu->kernel == KL_NONE) {
		if (opt_scrypt) {
			applog(LOG_INFO, "Selecting scrypt kernel");
			clState->chosen_kernel = KL_SCRYPT;
		} else if (opt_blake256) {
			applog(LOG_INFO, "Selecting blake256 kernel");
			clState->chosen_kernel = KL_BLAKE256;
		} else if (!strstr(name, "Tahiti") &&
			/* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */
			(strstr(vbuff, "844.4") ||  // Linux 64 bit ATI 2.6 SDK
			 strstr(vbuff, "851.4") ||  // Windows 64 bit ""
			 strstr(vbuff, "831.4") ||
			 strstr(vbuff, "898.1") ||  // 12.2 driver SDK 
			 strstr(vbuff, "923.1") ||  // 12.4
			 strstr(vbuff, "938.2") ||  // SDK 2.7
			 strstr(vbuff, "1113.2"))) {// SDK 2.8
				applog(LOG_INFO, "Selecting diablo kernel");
				clState->chosen_kernel = KL_DIABLO;
		/* Detect all 7970s, older ATI and NVIDIA and use poclbm */
		} else if (strstr(name, "Tahiti") || !clState->hasBitAlign) {
			applog(LOG_INFO, "Selecting poclbm kernel");
			clState->chosen_kernel = KL_POCLBM;
		/* Use phatk for the rest R5xxx R6xxx */
		} else {
			applog(LOG_INFO, "Selecting phatk kernel");
			clState->chosen_kernel = KL_PHATK;
		}
		cgpu->kernel = clState->chosen_kernel;
	} else {
		clState->chosen_kernel = cgpu->kernel;
		if (clState->chosen_kernel == KL_PHATK &&
		    (strstr(vbuff, "844.4") || strstr(vbuff, "851.4") ||
		     strstr(vbuff, "831.4") || strstr(vbuff, "898.1") ||
		     strstr(vbuff, "923.1") || strstr(vbuff, "938.2") ||
		     strstr(vbuff, "1113.2"))) {
			applog(LOG_WARNING, "WARNING: You have selected the phatk kernel.");
			applog(LOG_WARNING, "You are running SDK 2.6+ which performs poorly with this kernel.");
			applog(LOG_WARNING, "Downgrade your SDK and delete any .bin files before starting again.");
			applog(LOG_WARNING, "Or allow cgminer to automatically choose a more suitable kernel.");
		}
	}

	/* For some reason 2 vectors is still better even if the card says
	 * otherwise, and many cards lie about their max so use 256 as max
	 * unless explicitly set on the command line. Tahiti prefers 1 */
	if (strstr(name, "Tahiti"))
		preferred_vwidth = 1;
	else if (preferred_vwidth > 2)
		preferred_vwidth = 2;

	switch (clState->chosen_kernel) {
		case KL_POCLBM:
			strcpy(filename, POCLBM_KERNNAME".cl");
			strcpy(binaryfilename, POCLBM_KERNNAME);
			break;
		case KL_PHATK:
			strcpy(filename, PHATK_KERNNAME".cl");
			strcpy(binaryfilename, PHATK_KERNNAME);
			break;
		case KL_DIAKGCN:
			strcpy(filename, DIAKGCN_KERNNAME".cl");
			strcpy(binaryfilename, DIAKGCN_KERNNAME);
			break;
		case KL_SCRYPT:
			strcpy(filename, SCRYPT_KERNNAME".cl");
			strcpy(binaryfilename, SCRYPT_KERNNAME);
			/* Scrypt only supports vector 1 */
			cgpu->vwidth = 1;
			break;
		case KL_BLAKE256:
			strcpy(filename, BLAKE256_KERNNAME".cl");
			strcpy(binaryfilename, BLAKE256_KERNNAME);
			break;
		case KL_NONE: /* Shouldn't happen */
		case KL_DIABLO:
			strcpy(filename, DIABLO_KERNNAME".cl");
			strcpy(binaryfilename, DIABLO_KERNNAME);
			break;
	}

	if (cgpu->vwidth)
		clState->vwidth = cgpu->vwidth;
	else {
		clState->vwidth = preferred_vwidth;
		cgpu->vwidth = preferred_vwidth;
	}

	if (((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) &&
		clState->vwidth == 1 && clState->hasOpenCL11plus) || opt_scrypt || opt_blake256)
			clState->goffset = true;

	if (cgpu->work_size && cgpu->work_size <= clState->max_work_size)
		clState->wsize = cgpu->work_size;
	else if (opt_scrypt)
		clState->wsize = 256;
	else if (strstr(name, "Tahiti"))
		clState->wsize = 64;
	else
		clState->wsize = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->vwidth;
	cgpu->work_size = clState->wsize;

#ifdef USE_SCRYPT
	if (opt_scrypt) {
		if (!cgpu->opt_lg) {
			applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu);
			cgpu->lookup_gap = 2;
		} else
			cgpu->lookup_gap = cgpu->opt_lg;

		if (!cgpu->opt_tc) {
			unsigned int sixtyfours;

			sixtyfours =  cgpu->max_alloc / 131072 / 64 - 1;
			cgpu->thread_concurrency = sixtyfours * 64;
			if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) {
				cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders;
				if (cgpu->thread_concurrency > cgpu->shaders * 5)
					cgpu->thread_concurrency = cgpu->shaders * 5;
			}
			applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %d", gpu, (int)(cgpu->thread_concurrency));
		} else
			cgpu->thread_concurrency = cgpu->opt_tc;
	}
#endif

	FILE *binaryfile;
	size_t *binary_sizes;
	char **binaries;
	int pl;
	char *source = file_contents(filename, &pl);
	size_t sourceSize[] = {(size_t)pl};
	cl_uint slot, cpnd;

	slot = cpnd = 0;

	if (!source)
		return NULL;

	binary_sizes = calloc(sizeof(size_t) * MAX_GPUDEVICES * 4, 1);
	if (unlikely(!binary_sizes)) {
		applog(LOG_ERR, "Unable to calloc binary_sizes");
		return NULL;
	}
	binaries = calloc(sizeof(char *) * MAX_GPUDEVICES * 4, 1);
	if (unlikely(!binaries)) {
		applog(LOG_ERR, "Unable to calloc binaries");
		return NULL;
	}

	strcat(binaryfilename, name);
	if (clState->goffset)
		strcat(binaryfilename, "g");
	if (opt_scrypt) {
#ifdef USE_SCRYPT
		sprintf(numbuf, "lg%utc%u", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency);
		strcat(binaryfilename, numbuf);
#endif
	} else {
		sprintf(numbuf, "v%d", clState->vwidth);
		strcat(binaryfilename, numbuf);
	}
	sprintf(numbuf, "w%d", (int)clState->wsize);
	strcat(binaryfilename, numbuf);
	sprintf(numbuf, "l%d", (int)sizeof(long));
	strcat(binaryfilename, numbuf);
	strcat(binaryfilename, ".bin");

	binaryfile = fopen(binaryfilename, "rb");
	if (!binaryfile) {
		applog(LOG_DEBUG, "No binary found, generating from source");
	} else {
		struct stat binary_stat;

		if (unlikely(stat(binaryfilename, &binary_stat))) {
			applog(LOG_DEBUG, "Unable to stat binary, generating from source");
			fclose(binaryfile);
			goto build;
		}
		if (!binary_stat.st_size)
			goto build;

		binary_sizes[slot] = binary_stat.st_size;
		binaries[slot] = (char *)calloc(binary_sizes[slot], 1);
		if (unlikely(!binaries[slot])) {
			applog(LOG_ERR, "Unable to calloc binaries");
			fclose(binaryfile);
			return NULL;
		}

		if (fread(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot]) {
			applog(LOG_ERR, "Unable to fread binaries");
			fclose(binaryfile);
			free(binaries[slot]);
			goto build;
		}

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
			fclose(binaryfile);
			free(binaries[slot]);
			goto build;
		}

		fclose(binaryfile);
		applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename);

		goto built;
	}

	/////////////////////////////////////////////////////////////////
	// Load CL file, build CL program object, create CL kernel object
	/////////////////////////////////////////////////////////////////

build:
	clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithSource)", status);
		return NULL;
	}

	/* create a cl program executable for all the devices specified */
	char *CompilerOptions = calloc(1, 256);

#ifdef USE_SCRYPT
	if (opt_scrypt)
		sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d",
			cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, (int)clState->wsize);
	else
#endif
	{
		sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d",
			(int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth);
	}
	applog(LOG_DEBUG, "Setting worksize to %d", (int)(clState->wsize));
	if (clState->vwidth > 1)
		applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth);

	if (clState->hasBitAlign) {
		strcat(CompilerOptions, " -D BITALIGN");
		applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN");
		if (!clState->hasOpenCL12plus &&
		    (strstr(name, "Cedar") ||
		     strstr(name, "Redwood") ||
		     strstr(name, "Juniper") ||
		     strstr(name, "Cypress" ) ||
		     strstr(name, "Hemlock" ) ||
		     strstr(name, "Caicos" ) ||
		     strstr(name, "Turks" ) ||
		     strstr(name, "Barts" ) ||
		     strstr(name, "Cayman" ) ||
		     strstr(name, "Antilles" ) ||
		     strstr(name, "Wrestler" ) ||
		     strstr(name, "Zacate" ) ||
		     strstr(name, "WinterPark" )))
			patchbfi = true;
	} else
		applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN");

	if (patchbfi) {
		strcat(CompilerOptions, " -D BFI_INT");
		applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT");
	} else
		applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch");

	if (clState->goffset)
		strcat(CompilerOptions, " -D GOFFSET");

	if (!clState->hasOpenCL11plus)
		strcat(CompilerOptions, " -D OCL1");

	applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions);
	status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL);
	free(CompilerOptions);

	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status);
		size_t logSize;
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

		char *log = malloc(logSize);
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
		applog(LOG_ERR, "%s", log);
		return NULL;
	}

	prog_built = true;

#ifdef __APPLE__
	/* OSX OpenCL breaks reading off binaries with >1 GPU so always build
	 * from source. */
	goto built;
#endif

	status = clGetProgramInfo(clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)", status);
		return NULL;
	}

	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)", status);
		return NULL;
	}

	/* The actual compiled binary ends up in a RANDOM slot! Grr, so we have
	 * to iterate over all the binary slots and find where the real program
	 * is. What the heck is this!? */
	for (slot = 0; slot < cpnd; slot++)
		if (binary_sizes[slot])
			break;

	/* copy over all of the generated binaries. */
	applog(LOG_DEBUG, "Binary size for gpu %d found in binary slot %d: %d", gpu, slot, (int)(binary_sizes[slot]));
	if (!binary_sizes[slot]) {
		applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!");
		return NULL;
	}
	binaries[slot] = calloc(sizeof(char) * binary_sizes[slot], 1);
	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL );
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)", status);
		return NULL;
	}

	/* Patch the kernel if the hardware supports BFI_INT but it needs to
	 * be hacked in */
	if (patchbfi) {
		unsigned remaining = binary_sizes[slot];
		char *w = binaries[slot];
		unsigned int start, length;

		/* Find 2nd incidence of .text, and copy the program's
		* position and length at a fixed offset from that. Then go
		* back and find the 2nd incidence of \x7ELF (rewind by one
		* from ELF) and then patch the opcocdes */
		if (!advance(&w, &remaining, ".text"))
			goto build;
		w++; remaining--;
		if (!advance(&w, &remaining, ".text")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		memcpy(&start, w + 285, 4);
		memcpy(&length, w + 289, 4);
		w = binaries[slot]; remaining = binary_sizes[slot];
		if (!advance(&w, &remaining, "ELF"))
			goto build;
		w++; remaining--;
		if (!advance(&w, &remaining, "ELF")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		w--; remaining++;
		w += start; remaining -= start;
		applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching",
			w, remaining);
		patch_opcodes(w, length);

		status = clReleaseProgram(clState->program);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Releasing program. (clReleaseProgram)", status);
			return NULL;
		}

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
			return NULL;
		}

		/* Program needs to be rebuilt */
		prog_built = false;
	}

	free(source);

	/* Save the binary to be loaded next time */
	binaryfile = fopen(binaryfilename, "wb");
	if (!binaryfile) {
		/* Not a fatal problem, just means we build it again next time */
		applog(LOG_DEBUG, "Unable to create file %s", binaryfilename);
	} else {
		if (unlikely(fwrite(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot])) {
			applog(LOG_ERR, "Unable to fwrite to binaryfile");
			return NULL;
		}
		fclose(binaryfile);
	}
built:
	if (binaries[slot])
		free(binaries[slot]);
	free(binaries);
	free(binary_sizes);

	applog(LOG_INFO, "Initialising kernel %s with%s bitalign, %d vectors and worksize %d",
	       filename, clState->hasBitAlign ? "" : "out", clState->vwidth, (int)(clState->wsize));

	if (!prog_built) {
		/* create a cl program executable for all the devices specified */
		status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status);
			size_t logSize;
			status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

			char *log = malloc(logSize);
			status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
			applog(LOG_ERR, "%s", log);
			return NULL;
		}
	}

	/* get a kernel object handle for a kernel with the given name */
	clState->kernel = clCreateKernel(clState->program, "search", &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Creating Kernel from program. (clCreateKernel)", status);
		return NULL;
	}

#ifdef USE_SCRYPT
	if (opt_scrypt) {
		size_t ipt = (1024 / cgpu->lookup_gap + (1024 % cgpu->lookup_gap > 0));
		size_t bufsize = 128 * ipt * cgpu->thread_concurrency;

		/* Use the max alloc value which has been rounded to a power of
		 * 2 greater >= required amount earlier */
		if (bufsize > cgpu->max_alloc) {
			applog(LOG_WARNING, "Maximum buffer memory device %d supports says %lu",
						gpu, (long unsigned int)(cgpu->max_alloc));
			applog(LOG_WARNING, "Your scrypt settings come to %d", (int)bufsize);
		}
		applog(LOG_DEBUG, "Creating scrypt buffer sized %d", (int)bufsize);
		clState->padbufsize = bufsize;

		/* This buffer is weird and might work to some degree even if
		 * the create buffer call has apparently failed, so check if we
		 * get anything back before we call it a failure. */
		clState->padbuffer8 = NULL;
		clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status);
		if (status != CL_SUCCESS && !clState->padbuffer8) {
			applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease TC or increase LG", status);
			return NULL;
		}

		clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status);
			return NULL;
		}
		clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, SCRYPT_BUFFERSIZE, NULL, &status);
	} else
#endif
	if (opt_blake256) {
		clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status);
			return NULL;
		}
		clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, SCRYPT_BUFFERSIZE, NULL, &status);
	} else
		clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status);
		return NULL;
	}

	return clState;
}
Example #12
0
static int init_cladsyn(CSOUND *csound, CLADSYN *p){

  int asize, ipsize, fpsize, err;
  cl_device_id device_ids[32], device_id;             
  cl_context context;                
  cl_command_queue commands;          
  cl_program program;                
  cl_kernel kernel1, kernel2;                 
  cl_uint num = 0, nump =  0;
  cl_platform_id platforms[16];
    uint i;

  if(p->fsig->overlap > 1024)
     return csound->InitError(csound, "overlap is too large\n");



  err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 32, device_ids, &num);
  if (err != CL_SUCCESS){
    clGetPlatformIDs(16, platforms, &nump);
    int devs = 0;
    for(i=0; i < nump && devs < 32; i++){
     char name[128];
     clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 128, name, NULL);
     csound->Message(csound, "available platform[%d] %s\n",i, name);
     err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 32-devs, &device_ids[devs], &num);
    if (err != CL_SUCCESS)
     csound->InitError(csound, "failed to find an OpenCL device! %s \n", cl_error_string(err));
    }
    devs += num;
  }

  
  for(i=0; i < num; i++){
  char name[128];
  cl_device_type type;
  clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, 128, name, NULL);
  clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
  if(type & CL_DEVICE_TYPE_CPU)
  csound->Message(csound, "available CPU[device %d] %s\n",i, name);
  else  if(type & CL_DEVICE_TYPE_GPU)
  csound->Message(csound, "available GPU[device %d] %s\n",i, name);
  else  if(type & CL_DEVICE_TYPE_ACCELERATOR)
  csound->Message(csound, "available ACCELLERATOR[device %d] %s\n",i, name);
  else 
  csound->Message(csound, "available generic [device %d] %s\n",i, name);;
  }

  // SELECT THE GPU HERE
  if(*p->idev < num)
   device_id = device_ids[(int)*p->idev];
  else
   device_id = device_ids[num-1];

   context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
   if (!context)
     return csound->InitError(csound, "Failed to create a compute context! %s\n", 
                             cl_error_string(err));
  
    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
       return csound->InitError(csound, "Failed to create a command commands! %s\n", 
                             cl_error_string(err));
    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) &code, NULL, &err);
    if (!program)
       return csound->InitError(csound, "Failed to create compute program! %s\n", 
                             cl_error_string(err));
  
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
        csound->Message(csound, "Failed to build program executable! %s\n", 
                             cl_error_string(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        return csound->InitError(csound, "%s\n", buffer);
    }

    kernel1 = clCreateKernel(program, "sample", &err);
    if (!kernel1 || err != CL_SUCCESS)
      return csound->InitError(csound, "Failed to create sample compute kernel! %s\n", 
                             cl_error_string(err));

   kernel2 = clCreateKernel(program, "update", &err);
    if (!kernel2 || err != CL_SUCCESS)
      return csound->InitError(csound,"Failed to create update compute kernel! %s\n", 
                             cl_error_string(err));
 
  char name[128];
  clGetDeviceInfo(device_id, CL_DEVICE_NAME, 128, name, NULL);
  csound->Message(csound, "using device: %s\n",name);

  p->bins = (p->fsig->N)/2;

  if(*p->inum > 0 && *p->inum < p->bins) p->bins = *p->inum;

  p->vsamps = p->fsig->overlap;
  p->threads = p->bins*p->vsamps;
  p->mthreads = (p->bins > p->vsamps ? p->bins : p->vsamps);

  asize =  p->vsamps*sizeof(cl_float);
  ipsize = (p->bins > p->vsamps ? p->bins : p->vsamps)*sizeof(cl_long);
  fpsize = p->fsig->N*sizeof(cl_float);

  p->out = clCreateBuffer(context,0, asize, NULL, NULL);
  p->frame =   clCreateBuffer(context, CL_MEM_READ_ONLY, fpsize, NULL, NULL);
  p->ph =  clCreateBuffer(context,0, ipsize, NULL, NULL);
  p->amps =  clCreateBuffer(context,0,(p->bins > p->vsamps ? p->bins : p->vsamps)*sizeof(cl_float), NULL, NULL);
 
  // memset needed?

  asize = p->vsamps*sizeof(float);
  if(p->out_.auxp == NULL ||
      p->out_.size < (unsigned long) asize)
    csound->AuxAlloc(csound, asize , &p->out_);

  csound->RegisterDeinitCallback(csound, p, destroy_cladsyn);
  p->count = 0;
  p->context = context;
  p->program = program;
  p->commands = commands;
  p->kernel1 = kernel1;
  p->kernel2 = kernel2;
 
  clGetKernelWorkGroupInfo(p->kernel1, 
       device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(p->wgs1), &p->wgs1, NULL);
  clGetKernelWorkGroupInfo(p->kernel2, 
       device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(p->wgs1), &p->wgs2, NULL);
 
  p->sr = csound->GetSr(csound); 
  clSetKernelArg(p->kernel1, 0, sizeof(cl_mem), &p->out);
  clSetKernelArg(p->kernel1, 1, sizeof(cl_mem), &p->frame);
  clSetKernelArg(p->kernel1, 2, sizeof(cl_mem), &p->ph);
  clSetKernelArg(p->kernel1, 3, sizeof(cl_mem), &p->amps);
  clSetKernelArg(p->kernel1, 5, sizeof(cl_int), &p->bins);
  clSetKernelArg(p->kernel1, 6, sizeof(cl_int), &p->vsamps);
  clSetKernelArg(p->kernel1, 7, sizeof(cl_float), &p->sr);

  clSetKernelArg(p->kernel2, 0, sizeof(cl_mem), &p->out);
  clSetKernelArg(p->kernel2, 1, sizeof(cl_mem), &p->frame);
  clSetKernelArg(p->kernel2, 2, sizeof(cl_mem), &p->ph);
  clSetKernelArg(p->kernel2, 3, sizeof(cl_mem), &p->amps);
  clSetKernelArg(p->kernel2, 5, sizeof(cl_int), &p->bins);
  clSetKernelArg(p->kernel2, 6, sizeof(cl_int), &p->vsamps);
  clSetKernelArg(p->kernel2, 7, sizeof(cl_float),  &p->sr); 
  return OK;
}
GPUBase::GPUBase(char* source, char* KernelName)
{
	kernelFuncName = KernelName;
	size_t szKernelLength;
	size_t szKernelLengthFilter;
	size_t szKernelLengthSum;
	char* SourceOpenCLShared;
	char* SourceOpenCL;
	iBlockDimX = 16;
	iBlockDimY = 16;

	GPUError = oclGetPlatformID(&cpPlatform);
	CheckError(GPUError);

	cl_uint uiNumAllDevs = 0;

	// Get the number of GPU devices available to the platform
	GPUError = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumAllDevs);
	CheckError(GPUError);
	uiDevCount = uiNumAllDevs;

	// Create the device list
	cdDevices = new cl_device_id [uiDevCount];
	GPUError = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiDevCount, cdDevices, NULL);
	CheckError(GPUError);

	// Create the OpenCL context on a GPU device
	GPUContext = clCreateContext(0, uiNumAllDevs, cdDevices, NULL, NULL, &GPUError);
	CheckError(GPUError);

	//The command-queue can be used to queue a set of operations (referred to as commands) in order.
	GPUCommandQueue = clCreateCommandQueue(GPUContext, cdDevices[0], 0, &GPUError);
	CheckError(GPUError);

	oclPrintDevName(LOGBOTH, cdDevices[0]);

	// Load OpenCL kernel
	SourceOpenCLShared = oclLoadProgSource("C:\\Dropbox\\MGR\\GPUFeatureExtraction\\GPU\\OpenCL\\GPUCode.cl", "// My comment\n", &szKernelLength);

	SourceOpenCL = oclLoadProgSource(source, "// My comment\n", &szKernelLengthFilter);
	szKernelLengthSum = szKernelLength + szKernelLengthFilter;
	char* sourceCL = new char[szKernelLengthSum];
	strcpy(sourceCL,SourceOpenCLShared);
	strcat (sourceCL, SourceOpenCL);
	
	GPUProgram = clCreateProgramWithSource( GPUContext , 1, (const char **)&sourceCL, &szKernelLengthSum, &GPUError);
	CheckError(GPUError);

	// Build the program with 'mad' Optimization option
	char *flags = "-cl-unsafe-math-optimizations -cl-fast-relaxed-math -cl-mad-enable";

	GPUError = clBuildProgram(GPUProgram, 0, NULL, flags, NULL, NULL);
	//error checking code
	if(!GPUError)
	{
		//print kernel compilation error
		char programLog[1024];
		clGetProgramBuildInfo(GPUProgram, cdDevices[0], CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
		cout<<programLog<<endl;
	}


	cout << kernelFuncName << endl;

	GPUKernel = clCreateKernel(GPUProgram, kernelFuncName, &GPUError);
	CheckError(GPUError);

	

}
int main(void) {
	//###############################################
	//
	// Declare variables for OpenCL
	//
	//###############################################
	int err;               // error code returned from OpenCL calls

	size_t global;                  // global domain size

	cl_device_id device_id;     // compute device id
	cl_context context;       // compute context
	cl_command_queue commands;      // compute command queue
	cl_program program;       // compute program
	cl_kernel ko_calculate_imagerowdots_iterations;       // compute kernel
	cl_kernel ko_calculate_colorrow;       // compute kernel

	cl_mem d_a;                    // device memory used for the input  a vector
	cl_mem d_b;                    // device memory

	int i;

	//###############################################
	//
	// Set values for mandelbrot
	//
	//###############################################

	//plane section values
	float x_ebene_min = -1;
	float y_ebene_min = -1;
	float x_ebene_max = 2;
	float y_ebene_max = 1;

	//monitor resolution values
	const long x_mon = 640;
	const long y_mon = 480;

	//Iterations
	long itr = 100;

	//abort condition
	float abort_value = 2;

	//Number of images per second
	long fps = 24;

	//video duration in seconds
	long video_duration = 3;

	//zoom speed in percentage
	float reduction = 5;

	//zoom dot
	my_complex_t zoom_dot;

	//###############################################
	//
	// Set up platform and GPU device
	//
	//###############################################

	cl_uint numPlatforms;

	// Find number of platforms
	err = clGetPlatformIDs(0, NULL, &numPlatforms);
	checkError(err, "Finding platforms");
	if (numPlatforms == 0) {
		printf("Found 0 platforms!\n");
		return EXIT_FAILURE;
	}

	// Get all platforms
	cl_platform_id Platform[numPlatforms];
	err = clGetPlatformIDs(numPlatforms, Platform, NULL);
	checkError(err, "Getting platforms");

	// Secure a GPU
	for (i = 0; i < numPlatforms; i++) {
		err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
		if (err == CL_SUCCESS) {
			break;
		}
	}

	if (device_id == NULL)
		checkError(err, "Finding a device");

	err = output_device_info(device_id);
	checkError(err, "Printing device output");

	//###############################################
	//
	// Create context, command queue and kernel
	//
	//###############################################

	// Create a compute context
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	checkError(err, "Creating context");

	// Create a command queue
	commands = clCreateCommandQueue(context, device_id, 0, &err);
	checkError(err, "Creating command queue");

	//Read Kernel source
	FILE *fp;
	char *source_str;
	size_t source_size, program_size;

	fp = fopen("./kernel/calculate_iterations.cl", "r");
	if (!fp) {
		printf("Failed to load kernel\n");
		return 1;
	}

	fseek(fp, 0, SEEK_END);
	program_size = ftell(fp);
	rewind(fp);
	source_str = (char*) malloc(program_size + 1);
	source_str[program_size] = '\0';
	fread(source_str, sizeof(char), program_size, fp);
	fclose(fp);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **) &source_str,
	NULL, &err);

	checkError(err, "Creating program");

	// Build the program
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS) {
		size_t len;
		char buffer[2048];

		printf("Error: Failed to build program executable!\n%s\n",
				err_code(err));
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
				sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);

		// Determine the size of the log
		size_t log_size;
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL,
				&log_size);

		// Allocate memory for the log
		char *log = (char *) malloc(log_size);

		// Get the log
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
				log_size, log, NULL);

		// Print the log
		printf("%s\n", log);

		return EXIT_FAILURE;
	}

	// Create the compute kernel from the program
	ko_calculate_imagerowdots_iterations = clCreateKernel(program,
			"calculate_imagerowdots_iterations", &err);
	checkError(err, "Creating kernel");

	// Create the compute kernel from the program
	ko_calculate_colorrow = clCreateKernel(program, "calculate_colorrow", &err);
	checkError(err, "Creating kernel");

	int number_images = 0;
	do {
		//Get memory for image
		long* h_image = (long*) calloc(x_mon * y_mon, sizeof(long));
		unsigned char* h_image_pixel = (unsigned char*) calloc(
				x_mon * y_mon * 3, sizeof(unsigned char));

		//###############################################
		//###############################################
		//
		// Loop to calculate image dot iterations
		//
		//###############################################
		//###############################################

		float y_value = y_ebene_max;
		float delta_y = delta(y_ebene_min, y_ebene_max, y_mon);

		for (int row = 0; row < y_mon; ++row) {
			//###############################################
			//
			// Create and write buffer
			//
			//###############################################

			//Get memory for row
			long* h_image_row = (long*) calloc(x_mon, sizeof(long)); // a vector

			d_a = clCreateBuffer(context, CL_MEM_READ_WRITE,
					sizeof(long) * x_mon,
					NULL, &err);
			checkError(err, "Creating buffer d_a");

			// Write a vector into compute device memory
			err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0,
					sizeof(long) * x_mon, h_image_row, 0, NULL, NULL);
			checkError(err, "Copying h_a to device at d_a");

			//###############################################
			//
			// Set the arguments to our compute kernel
			//
			//###############################################

			err = clSetKernelArg(ko_calculate_imagerowdots_iterations, 0,
					sizeof(float), &x_ebene_min);
			err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 1,
					sizeof(float), &x_ebene_max);
			err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 2,
					sizeof(float), &y_value);
			err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 3,
					sizeof(long), &x_mon);
			err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 4,
					sizeof(float), &abort_value);
			err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 5,
					sizeof(long), &itr);
			err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 6,
					sizeof(cl_mem), &d_a);
			checkError(err, "Setting kernel arguments");

			/*__kernel void calculate_imagerowdots_iterations(const float x_min, const float x_max,
			 const float y_value, const long x_mon, const float abort_value, const long itr,
			 __global long * imagerow)*/

			// Execute the kernel over the entire range of our 1d input data set
			// letting the OpenCL runtime choose the work-group size
			global = x_mon;
			err = clEnqueueNDRangeKernel(commands,
					ko_calculate_imagerowdots_iterations, 1, NULL, &global,
					NULL, 0,
					NULL, NULL);
			checkError(err, "Enqueueing kernel");

			// Wait for the commands to complete
			err = clFinish(commands);
			checkError(err, "Waiting for kernel to finish");

			// Read back the results from the compute device
			err = clEnqueueReadBuffer(commands, d_a, CL_TRUE, 0,
					sizeof(long) * x_mon, h_image_row, 0, NULL, NULL);
			if (err != CL_SUCCESS) {
				printf("Error: Failed to read output array!\n%s\n",
						err_code(err));
				exit(1);
			}

			//reduce y
			y_value -= delta_y;

			//cope row to image
			memcpy(h_image + row * x_mon, h_image_row, sizeof(long) * x_mon);

			free(h_image_row);
		}

//		for (i = 0; i < x_mon * y_mon; ++i) {
//			printf("%ld ", h_image[i]);
//		}
//		fflush(stdout);

		//###############################################
		//###############################################
		//
		// End of loop to calculate image dot iterations
		//
		//###############################################
		//###############################################

		//###############################################
		//###############################################
		//
		// Beginn color calculation
		//
		//###############################################
		//###############################################

		for (int row = 0; row < y_mon; ++row) {
			//Get memory for row
			long* h_image_row = (long*) calloc(x_mon, sizeof(long)); // a vector
			memcpy(h_image_row, h_image + row * x_mon, sizeof(long) * x_mon);

			d_a = clCreateBuffer(context, CL_MEM_READ_ONLY,
					sizeof(long) * x_mon,
					NULL, &err);
			checkError(err, "Creating buffer d_a");

			// Write a vector into compute device memory
			err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0,
					sizeof(long) * x_mon, h_image_row, 0, NULL, NULL);
			checkError(err, "Copying h_image_row to device at d_a");

			unsigned char* h_imagepixel_row = (unsigned char*) calloc(x_mon * 3,
					sizeof(unsigned char));     // a vector

			d_b = clCreateBuffer(context, CL_MEM_READ_WRITE,
					sizeof(unsigned char) * x_mon * 3,
					NULL, &err);
			checkError(err, "Creating buffer d_b");

			// Write a vector into compute device memory
			err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0,
					sizeof(unsigned char) * x_mon * 3, h_imagepixel_row, 0,
					NULL, NULL);
			checkError(err, "Copying h_imagepixel_row to device at d_b");

			//###############################################
			//
			// Set the arguments to our compute kernel
			//
			//###############################################

			err = clSetKernelArg(ko_calculate_colorrow, 0, sizeof(long),
					&x_mon);
			err |= clSetKernelArg(ko_calculate_colorrow, 1, sizeof(long), &itr);
			err |= clSetKernelArg(ko_calculate_colorrow, 2, sizeof(cl_mem),
					&d_a);
			err |= clSetKernelArg(ko_calculate_colorrow, 3, sizeof(cl_mem),
					&d_b);
			checkError(err, "Setting kernel arguments");

			/*__kernel void calculate_colorrow(const long width, long itr, long * imagerowvalues,
			 unsigned char * imagerow)*/

			// Execute the kernel over the entire range of our 1d input data set
			// letting the OpenCL runtime choose the work-group size
			global = x_mon;
			err = clEnqueueNDRangeKernel(commands, ko_calculate_colorrow, 1,
			NULL, &global, NULL, 0,
			NULL, NULL);
			checkError(err, "Enqueueing kernel");

			// Wait for the commands to complete
			err = clFinish(commands);
			checkError(err, "Waiting for kernel to finish");

			// Read back the results from the compute device
			err = clEnqueueReadBuffer(commands, d_b, CL_TRUE, 0,
					sizeof(unsigned char) * x_mon * 3, h_imagepixel_row, 0,
					NULL, NULL);
			if (err != CL_SUCCESS) {
				printf("Error: Failed to read output array!\n%s\n",
						err_code(err));
				exit(1);
			}

			memcpy(h_image_pixel + row * x_mon * 3, h_imagepixel_row,
					sizeof(unsigned char) * x_mon * 3);

			free(h_image_row);
			free(h_imagepixel_row);
		}

		if (number_images == 0) {
			zoom_dot = find_dot_to_zoom(x_ebene_min, x_ebene_max, y_ebene_min,
					y_ebene_max, h_image, y_mon, x_mon, itr);
		}

		reduce_plane_section_focus_dot(&x_ebene_min, &x_ebene_max, &y_ebene_min,
				&y_ebene_max, reduction, zoom_dot);


		// save the image
		char filename[50];
		sprintf(filename, "img-%d.bmp", number_images);

		safe_image_to_bmp(x_mon, y_mon, h_image_pixel, filename);

		free(h_image);
		free(h_image_pixel);

		number_images++;
		itr = (long) (itr + itr * reduction / 100);
		printf("%d\n", number_images);
		fflush(stdout);
	} while (number_images < (fps * video_duration));

	//###############################################
	//
	// cleanup then shutdown
	//
	//###############################################

	clReleaseMemObject(d_a);
	clReleaseMemObject(d_b);
	clReleaseProgram(program);
	clReleaseKernel(ko_calculate_imagerowdots_iterations);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	return 0;
}
Example #15
0
int main() {

   /* Host/device data structures */
   cl_platform_id platform;
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_int err;

   /* Program/kernel data structures */
   cl_program program;
   FILE *program_handle;
   char *program_buffer, *program_log;
   size_t program_size, log_size;
   cl_kernel kernel;
   size_t offset = 0;
   size_t global_size, local_size;

   /* Data and buffers */
   char pattern[16] = "thatwithhavefrom";
   FILE *text_handle;
   char *text;
   size_t text_size;
   int chars_per_item;
   int result[4] = {0, 0, 0, 0};
   cl_mem text_buffer, result_buffer;
   
   /* Identify a platform */
   err = clGetPlatformIDs(1, &platform, NULL);
   if(err < 0) {
      perror("Couldn't identify a platform");
      exit(1);
   } 

   /* Access a device */
   err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
   if(err < 0) {
      perror("Couldn't access any devices");
      exit(1);   
   }

   /* Determine global size and local size */
   clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, 		
      sizeof(global_size), &global_size, NULL);	
   clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, 		
      sizeof(local_size), &local_size, NULL);
   global_size *= local_size;

   /* Create a context */
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }

   /* Read program file and place content into buffer */
   program_handle = fopen(PROGRAM_FILE, "r");
   if(program_handle == NULL) {
      perror("Couldn't find the program file");
      exit(1);
   }
   fseek(program_handle, 0, SEEK_END);
   program_size = ftell(program_handle);
   rewind(program_handle);
   program_buffer = (char*)calloc(program_size+1, sizeof(char));
   fread(program_buffer, sizeof(char), program_size, program_handle);
   fclose(program_handle);

   /* Read text file and place content into buffer */
   text_handle = fopen(TEXT_FILE, "r");
   if(text_handle == NULL) {
      perror("Couldn't find the text file");
      exit(1);
   }
   fseek(text_handle, 0, SEEK_END);
   text_size = ftell(text_handle)-1;
   rewind(text_handle);
   text = (char*)calloc(text_size, sizeof(char));
   fread(text, sizeof(char), text_size, text_handle);
   fclose(text_handle);
   chars_per_item = text_size / global_size + 1;

   /* Create program from file */
   program = clCreateProgramWithSource(context, 1, 
      (const char**)&program_buffer, &program_size, &err);
   if(err < 0) {
      perror("Couldn't create the program");
      exit(1);
   }
   free(program_buffer);

   /* Build program */
   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
   if(err < 0) {
            
      /* Find size of log and print to std output */
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
            0, NULL, &log_size);
      program_log = (char*) calloc(log_size+1, sizeof(char));
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
            log_size+1, program_log, NULL);
      printf("%s\n", program_log);
      free(program_log);
      exit(1);
   }

   /* Create a kernel */
   kernel = clCreateKernel(program, KERNEL_FUNC, &err);
   if(err < 0) {
      perror("Couldn't create a kernel");
      exit(1);
   };

   /* Create buffers to hold the text characters and count */
   text_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
         CL_MEM_COPY_HOST_PTR, text_size, text, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);   
   };
   result_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
         CL_MEM_COPY_HOST_PTR, sizeof(result), result, NULL);

   /* Create kernel argument */
   err = clSetKernelArg(kernel, 0, sizeof(pattern), pattern);
   err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &text_buffer);
   err |= clSetKernelArg(kernel, 2, sizeof(chars_per_item), &chars_per_item);
   err |= clSetKernelArg(kernel, 3, 4 * sizeof(int), NULL);
   err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &result_buffer);
   if(err < 0) {
      printf("Couldn't set a kernel argument");
      exit(1);   
   };

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 0, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Enqueue kernel */
   err = clEnqueueNDRangeKernel(queue, kernel, 1, &offset, &global_size, 
         &local_size, 0, NULL, NULL); 
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      printf("Error code: %d\n", err);
      exit(1);   
   }

   /* Read and print the result */
   err = clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0, 
      sizeof(result), &result, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't read the buffer");
      exit(1);   
   }

   printf("\nResults: \n");
   printf("Number of occurrences of 'that': %d\n", result[0]);
   printf("Number of occurrences of 'with': %d\n", result[1]);
   printf("Number of occurrences of 'have': %d\n", result[2]);
   printf("Number of occurrences of 'from': %d\n", result[3]);

   /* Deallocate resources */
   clReleaseMemObject(result_buffer);
   clReleaseMemObject(text_buffer);
   clReleaseKernel(kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}
Example #16
0
int main() {

   /* Host/device data structures */
   cl_platform_id platform;
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_int err, i;

   /* Program/kernel data structures */
   cl_program program;
   FILE *program_handle;
   char *program_buffer, *program_log;
   size_t program_size, log_size;
   cl_kernel kernel;
   size_t global_size, local_size;

   /* Data and buffers */
   int num_rows, num_cols, num_values;
   int *rows, *cols;
   float *values, *b_vec;
   float result[2];
   double value_double;
   cl_mem rows_buffer, cols_buffer, values_buffer, 
         b_buffer, result_buffer;

   /* Read sparse file */
   FILE *mm_handle;
   MM_typecode code;

   /* Read matrix file */   
   if ((mm_handle = fopen(MM_FILE, "r")) == NULL) {
      perror("Couldn't open the MatrixMarket file");
      exit(1);
   }
   mm_read_banner(mm_handle, &code);
   mm_read_mtx_crd_size(mm_handle, &num_rows, &num_cols, &num_values);

   /* Check for symmetry and allocate memory */
   if(mm_is_symmetric(code) || mm_is_skew(code) || mm_is_hermitian(code)) {
      num_values += num_values - num_rows;
   }
   rows = (int*) malloc(num_values * sizeof(int));
   cols = (int*) malloc(num_values * sizeof(int));
   values = (float*) malloc(num_values * sizeof(float));
   b_vec = (float*) malloc(num_rows * sizeof(float));

   /* Read matrix data and close file */
   i=0;
   while(i<num_values) {
      fscanf(mm_handle, "%d %d %lg\n", &rows[i], &cols[i], &value_double);
      values[i] = (float)value_double;
      cols[i]--;
      rows[i]--;
      if((rows[i] != cols[i]) && (mm_is_symmetric(code) || mm_is_skew(code) || mm_is_hermitian(code))) {
         i++;
         rows[i] = cols[i-1];
         cols[i] = rows[i-1];
         values[i] = values[i-1];
      }
      i++;
   }
   sort(num_values, rows, cols, values);
   fclose(mm_handle);

   /* Initialize the b vector */
   srand(time(0));
   for(i=0; i<num_rows; i++) {
      b_vec[i] = (float)(rand() - RAND_MAX/2);
   }

   /* Identify a platform */
   err = clGetPlatformIDs(1, &platform, NULL);
   if(err < 0) {
      perror("Couldn't identify a platform");
      exit(1);
   } 

   /* Access a device */
   err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
   if(err < 0) {
      perror("Couldn't access any devices");
      exit(1);   
   }

   /* Create a context */
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }

   /* Read program file and place content into buffer */
   program_handle = fopen(PROGRAM_FILE, "r");
   if(program_handle == NULL) {
      perror("Couldn't find the program file");
      exit(1);
   }
   fseek(program_handle, 0, SEEK_END);
   program_size = ftell(program_handle);
   rewind(program_handle);
   program_buffer = (char*)calloc(program_size+1, sizeof(char));
   fread(program_buffer, sizeof(char), program_size, program_handle);
   fclose(program_handle);

   /* Create program from file */
   program = clCreateProgramWithSource(context, 1, 
      (const char**)&program_buffer, &program_size, &err);
   if(err < 0) {
      perror("Couldn't create the program");
      exit(1);
   }
   free(program_buffer);

   /* Build program */
   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
   if(err < 0) {
            
      /* Find size of log and print to std output */
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
            0, NULL, &log_size);
      program_log = (char*) calloc(log_size+1, sizeof(char));
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
            log_size+1, program_log, NULL);
      printf("%s\n", program_log);
      free(program_log);
      exit(1);
   }

   /* Create a kernel */
   kernel = clCreateKernel(program, KERNEL_FUNC, &err);
   if(err < 0) {
      printf("Couldn't create a kernel: %d", err);
      exit(1);
   };

   /* Create buffers to hold the text characters and count */
   rows_buffer = clCreateBuffer(context, 
         CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
         num_values*sizeof(int), rows, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);   
   };
   cols_buffer = clCreateBuffer(context,
         CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
         num_values*sizeof(int), cols, NULL);
   values_buffer = clCreateBuffer(context,
         CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
         num_values*sizeof(float), values, NULL);
   b_buffer = clCreateBuffer(context,
         CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
         num_values*sizeof(float), b_vec, NULL);
   result_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
         2*sizeof(float), NULL, NULL);

   /* Create kernel argument */
   err = clSetKernelArg(kernel, 0, sizeof(num_rows), &num_rows);
   err |= clSetKernelArg(kernel, 1, sizeof(num_values), &num_values);
   err |= clSetKernelArg(kernel, 2, num_rows * sizeof(float), NULL);
   err |= clSetKernelArg(kernel, 3, num_rows * sizeof(float), NULL);
   err |= clSetKernelArg(kernel, 4, num_rows * sizeof(float), NULL);
   err |= clSetKernelArg(kernel, 5, num_rows * sizeof(float), NULL);
   err |= clSetKernelArg(kernel, 6, sizeof(cl_mem), &rows_buffer);
   err |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &cols_buffer);
   err |= clSetKernelArg(kernel, 8, sizeof(cl_mem), &values_buffer);
   err |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &b_buffer);
   err |= clSetKernelArg(kernel, 10, sizeof(cl_mem), &result_buffer);
   if(err < 0) {
      printf("Couldn't set a kernel argument");
      exit(1);   
   };

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 0, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Enqueue kernel */
   global_size = num_rows;
   local_size = num_rows;
   err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, 
         &local_size, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      printf("Error: %d\n", err);
      exit(1);
   }

   /* Read the results */
   err = clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0, 
      2*sizeof(float), result, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't read the buffer");
      exit(1);   
   }

   /* Print the result */
   printf("After %d iterations, the residual length is %f.\n", 
         (int)result[0], result[1]);

   /* Deallocate resources */
   free(b_vec);
   free(rows);
   free(cols);
   free(values);
   clReleaseMemObject(b_buffer);
   clReleaseMemObject(rows_buffer);
   clReleaseMemObject(cols_buffer);
   clReleaseMemObject(values_buffer);
   clReleaseMemObject(result_buffer);
   clReleaseKernel(kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}
Example #17
0
/////////////////////////////////////////////////////////////////
// Create OpenCL memory buffers
/////////////////////////////////////////////////////////////////
bool initializeCLBuffers (JobToPUM *job) {
  cl_int status = CL_SUCCESS;

  int devID = job->runOn;
  cl_context *context = PUInfoStruct.PUsContexts[devID];

  cl_device_id device;// = calloc(1, sizeof(cl_device_id));

  size_t deviceListSize;
  status = clGetContextInfo(*context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize);
  if (status != CL_SUCCESS) {
    fprintf(stderr, "clGetContextInfo failed (%i).\n", status);
    return false;
  }

  //Get the corresponding device (TODO: currently only one)
  status = clGetContextInfo(*context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device, NULL);
  if (status != CL_SUCCESS) {
    fprintf(stderr, "clGetContextInfo failed (%i).\n", status);
    return false;
  }

  PUInfoStruct.argBuffers[devID] = calloc(job->nTotalArgs, sizeof(cl_mem));


  //TODO: support other memory locations
  for (cl_uint i = 0; i < job->nTotalArgs; i++) {
    /* Create buffers */
    PUInfoStruct.argBuffers[devID][i] = clCreateBuffer(*context, CL_MEM_READ_WRITE, job->argSizes[i], 0, &status);
    if (status != CL_SUCCESS) {
      fprintf(stderr,"clCreateBuffer failed. (inputBuffers[%i]))\n",i);
      return false;
    }

    if ((job->argTypes[i] == INPUT) ||
        (job->argTypes[i] == INPUT_OUTPUT)) {
      if (job->probID == myid && job->jobID == 2){
        for (int j = 0; j < 30; j++) {
          status = clEnqueueWriteBuffer(*(PUInfoStruct.PUsCmdQs[devID]), PUInfoStruct.argBuffers[devID][i], CL_TRUE, 0, job->argSizes[i], job->arguments[i], 0, 0, 0);
          if (status != CL_SUCCESS) {
            fprintf(stderr,"clEnqueueWriteBuffer failed. (inputBuffers[%i])\n",i);
            return false;
          }
        }
      }
      else {
        status = clEnqueueWriteBuffer(*(PUInfoStruct.PUsCmdQs[devID]), PUInfoStruct.argBuffers[devID][i], CL_TRUE, 0, job->argSizes[i], job->arguments[i], 0, 0, 0);
        if (status != CL_SUCCESS) {
          fprintf(stderr,"clEnqueueWriteBuffer failed. (inputBuffers[%i])\n",i);
          return false;
        }
      }
    }
  }

  /////////////////////////////////////////////////////////////////
  // Load CL source, build CL program object, create CL kernel object
  /////////////////////////////////////////////////////////////////
  /* create a CL program using the kernel source */
  cl_program program;// = malloc(sizeof(cl_program));
//  PUInfoStruct.currKernels[devID] = malloc(sizeof(cl_kernel));

  const char *sourceStr = job->taskSource;
  if (debug_PUM)
    fprintf(stderr, "TRYING (%i)\n", job->taskSourceSize);
  if (sourceStr == NULL) {
    fprintf(stderr, "initializeCLBuffers srcStr is NULL\n");
    return false;
  }
  size_t sourceSize[] = { strlen(sourceStr) };
  assert(sourceSize[0] == job->taskSourceSize-1);

  program = clCreateProgramWithSource(*context, 1, &sourceStr, sourceSize, &status);
  if (status != CL_SUCCESS) {
    fprintf(stderr, "clCreateProgramWithSource failed.\n");
    return false;
  }

  /* create a cl program executable for all the devices specified */
  //currently only one device supported
  status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);

  size_t ret_val_size;
  clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);


  char *build_log = calloc(ret_val_size+1, sizeof(char));
  clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);

  build_log[ret_val_size] = '\0';

  if (!SILENT){
    if (strlen(build_log) > 0) {
      fprintf(stderr, "PUM (%i): buildlog:\n>>>\n%s\n<<<\n__END OF BUILDLOG__\n",myid, build_log);
    } else {
      fprintf(stderr, "PUM (%i): NO BUILD LOG\n", myid);
    }
  }
  free(build_log);

  if (status != CL_SUCCESS) {
    fprintf(stderr, "clBuildProgram failed (%i).\n", status);
    return false;
  }

  /* get a kernel object handle for a kernel with the given name */
  PUInfoStruct.currKernels[devID] = clCreateKernel(program, job->startingKernel, &status);
  if (status != CL_SUCCESS) {
    fprintf(stderr, "clCreateKernel failed.\n");
    return false;
  }

//  free (device);
//  free (program); //TODO: check if this should be persistent

  return true;

}
Example #18
0
    cl_program getOrBuildProgram(const Context* ctx, const cv::ocl::ProgramEntry* source, const String& options)
    {
        cl_int status = 0;
        cl_program program = NULL;
        std::vector<char> binary;
        if (!enable_disk_cache || !readConfigurationFromFile(options, binary))
        {
            program = clCreateProgramWithSource(getClContext(ctx), 1, (const char**)&source->programStr, NULL, &status);
            openCLVerifyCall(status);
            cl_device_id device = getClDeviceID(ctx);
            status = clBuildProgram(program, 1, &device, options.c_str(), NULL, NULL);
            if(status == CL_SUCCESS)
            {
                if (enable_disk_cache)
                {
                    size_t binarySize;
                    openCLSafeCall(clGetProgramInfo(program,
                                            CL_PROGRAM_BINARY_SIZES,
                                            sizeof(size_t),
                                            &binarySize, NULL));

                    std::vector<char> binary(binarySize);

                    char* ptr = &binary[0];
                    openCLSafeCall(clGetProgramInfo(program,
                                            CL_PROGRAM_BINARIES,
                                            sizeof(char*),
                                            &ptr,
                                            NULL));

                    if (!writeConfigurationToFile(options, binary))
                    {
                        std::cerr << "Can't write data to file: " << fileName_ << std::endl;
                    }
                }
            }
        }
        else
        {
            cl_device_id device = getClDeviceID(ctx);
            size_t size = binary.size();
            const char* ptr = &binary[0];
            program = clCreateProgramWithBinary(getClContext(ctx),
                    1, &device,
                    (const size_t *)&size, (const unsigned char **)&ptr,
                    NULL, &status);
            openCLVerifyCall(status);
            status = clBuildProgram(program, 1, &device, options.c_str(), NULL, NULL);
        }

        if(status != CL_SUCCESS)
        {
            if(status == CL_BUILD_PROGRAM_FAILURE)
            {
                size_t buildLogSize = 0;
                openCLSafeCall(clGetProgramBuildInfo(program, getClDeviceID(ctx),
                        CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize));
                std::vector<char> buildLog; buildLog.resize(buildLogSize);
                memset(&buildLog[0], 0, buildLogSize);
                openCLSafeCall(clGetProgramBuildInfo(program, getClDeviceID(ctx),
                        CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[0], NULL));
                std::cout << std::endl << "BUILD LOG: "
                        << (source->name ? source->name : "dynamic program") << ": "
                        << options << "\n";
                std::cout << &buildLog[0] << std::endl;
            }
            openCLVerifyCall(status);
        }
        return program;
    }
Example #19
0
int main(int argc, char** argv) {

   /* OpenCL 1.1 data structures */
   cl_platform_id* platforms;
   cl_program program;
   cl_context context;

   /* OpenCL 1.1 scalar data types */
   cl_uint numOfPlatforms;
   cl_int  error;

   /*
    Prepare an array of __cl_float4 via dynamic memory allocation
    This will map to the native vector type which is SSE / SSE2 / AVX on
    Intel-compatible processors.
   */
   cl_float8* ud_in = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // input to device
   cl_float8* ud_out = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // output from device
   for( int i = 0; i < DATA_SIZE; ++i) {
       ud_in[i] = (cl_float8){(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i};
       ud_out[i] = (cl_float8){(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f};
   }

   /* 
      Get the number of platforms 
      Remember that for each vendor's SDK installed on the computer,
      the number of available platform also increased. 
    */
   error = clGetPlatformIDs(0, NULL, &numOfPlatforms);
   if(error != CL_SUCCESS ) {			
      perror("Unable to find any OpenCL platforms");
      exit(1);
   }

   platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms);
   printf("Number of OpenCL platforms found: %d\n", numOfPlatforms);

   error = clGetPlatformIDs(numOfPlatforms, platforms, NULL);
   if(error != CL_SUCCESS ) {			
      perror("Unable to find any OpenCL platforms");
      exit(1);
   }
   // Search for a CPU/GPU device through the installed platforms
   // Build a OpenCL program and do not run it.
   for(cl_uint i = 0; i < numOfPlatforms; i++ ) {

        cl_uint numOfDevices = 0;

        /* Determine how many devices are connected to your platform */
        error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &numOfDevices);
        if (error != CL_SUCCESS ) { 
            perror("Unable to obtain any OpenCL compliant device info");
            exit(1);
        }
        cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices);

        /* Load the information about your devices into the variable 'devices' */
        error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL);
        if (error != CL_SUCCESS ) { 
            perror("Unable to obtain any OpenCL compliant device info");
            exit(1);
        }
        printf("Number of detected OpenCL devices: %d\n", numOfDevices);

	    /* Create a context */
        cl_context_properties ctx[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i], 0 };
	    context = clCreateContext(ctx, numOfDevices, devices, NULL, NULL, &error);
	    if(error != CL_SUCCESS) {
	        perror("Can't create a valid OpenCL context");
	        exit(1);
	    }

	    /* For each device, create a buffer and partition that data among the devices for compute! */
	    cl_mem inobj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
	                                  sizeof(cl_float8) * DATA_SIZE, ud_in, &error);
	    if(error != CL_SUCCESS) {
	        perror("Can't create a buffer");
	        exit(1);
	    }

        int offset = 0; 
        for(int i = 0; i < numOfDevices; ++i, ++offset ) {
	        /* Load the two source files into temporary datastores */
	        const char *file_names[] = {"vectorization.cl"}; 
	        const int NUMBER_OF_FILES = 1;
	        char* buffer[NUMBER_OF_FILES];
	        size_t sizes[NUMBER_OF_FILES];
	        loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes);
	
	        /* Create the OpenCL program object */
	        program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error);				
		    if(error != CL_SUCCESS) {
		      perror("Can't create the OpenCL program object");
		      exit(1);   
		    }

	        /* Build OpenCL program object and dump the error message, if any */
	        char *program_log;
	        size_t log_size;
            char* build_options = "-fbin-llvmir -fbin-amdil -fbin-exe";
	        error = clBuildProgram(program, 1, &devices[i], build_options, NULL, NULL);		
		    if(error != CL_SUCCESS) {
		      // If there's an error whilst building the program, dump the log
		      clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
		      program_log = (char*) malloc(log_size+1);
		      program_log[log_size] = '\0';
		      clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 
		            log_size+1, program_log, NULL);
		      printf("\n=== ERROR ===\n\n%s\n=============\n", program_log);
		      free(program_log);
		      exit(1);
		    }
	  
	        /* Query the program as to how many kernels were detected */
	        cl_uint numOfKernels;
	        error = clCreateKernelsInProgram(program, 0, NULL, &numOfKernels);
	        if (error != CL_SUCCESS) {
	            perror("Unable to retrieve kernel count from program");
	            exit(1);
	        }
	        cl_kernel* kernels = (cl_kernel*) alloca(sizeof(cl_kernel) * numOfKernels);
	        error = clCreateKernelsInProgram(program, numOfKernels, kernels, NULL);

            /* Loop thru each kernel and execute on device */
	        for(cl_uint j = 0; j < numOfKernels; j++) {
	            char kernelName[32];
	            cl_uint argCnt;
	            clGetKernelInfo(kernels[j], CL_KERNEL_FUNCTION_NAME, sizeof(kernelName), kernelName, NULL);
	            clGetKernelInfo(kernels[j], CL_KERNEL_NUM_ARGS, sizeof(argCnt), &argCnt, NULL);
	            printf("Kernel name: %s with arity: %d\n", kernelName, argCnt);
	            printf("About to create command queue and enqueue this kernel...\n");
	
	            /* Create a command queue */
	            cl_command_queue cQ = clCreateCommandQueue(context, devices[i], 0, &error);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to create command-queue");
	                exit(1);
	            }
	
                /* Create a buffer and copy the data from the main buffer */
	            cl_mem outobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 
	                                                sizeof(cl_float8) * DATA_SIZE, 0, &error);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to create sub-buffer object");
	                exit(1);
	            }

	            /* Let OpenCL know that the kernel is suppose to receive an argument */
	            error = clSetKernelArg(kernels[j], 0, sizeof(cl_mem), &inobj);
	            error = clSetKernelArg(kernels[j], 1, sizeof(cl_mem), &outobj);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to set buffer object in kernel");
	                exit(1);
	            }
	
	            /* Enqueue the kernel to the command queue */
	            error = clEnqueueTask(cQ, kernels[j], 0, NULL, NULL);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to enqueue task to command-queue");
	                exit(1);
	            }
	            printf("Task has been enqueued successfully!\n");

	            /* Enqueue the read-back from device to host */
	            error = clEnqueueReadBuffer(cQ, outobj,
	                                         CL_TRUE,               // blocking read
	                                         0,                      // read from the start
	                                         sizeof(cl_float8)*DATA_SIZE,          // how much to copy
	                                         ud_out, 0, NULL, NULL);
                /* Check the returned data */
	            if ( valuesOK(ud_in, ud_out, DATA_SIZE) ) {
	                printf("Check passed!\n");
	            } else printf("Check failed!\n");
	
	            /* Release the command queue */
	            clReleaseCommandQueue(cQ);
	            clReleaseMemObject(outobj);
	        } 

        /* Clean up */
        
        for(cl_uint i = 0; i < numOfKernels; i++) { clReleaseKernel(kernels[i]); }
        for(int i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); }
        clReleaseProgram(program);
    }// end of device loop and execution
    
	    clReleaseMemObject(inobj);
        clReleaseContext(context);
   }// end of platform loop

   free(ud_in);
   free(ud_out);
}
Example #20
0
int main(int argc, char **argv){
	
	printf("Check OpenCL environtment\n");

	cl_platform_id platid;
	cl_device_id devid;
	cl_int res;
	size_t param;
	
	/* Query OpenCL, get some information about the returned device */
	clGetPlatformIDs(1u, &platid, NULL);
	clGetDeviceIDs(platid, CL_DEVICE_TYPE_ALL, 1, &devid, NULL);

	cl_char vendor_name[1024] = {0};
	cl_char device_name[1024] = {0};
	clGetDeviceInfo(devid, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, NULL);
	clGetDeviceInfo(devid, CL_DEVICE_NAME,   sizeof(device_name), device_name, NULL);
	printf("Connecting to OpenCL device:\t%s %s\n", vendor_name, device_name);
	
	clGetDeviceInfo(devid, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &param, NULL);
	printf("CL_DEVICE_MAX_COMPUTE_UNITS\t%d\n", param);
	
	clGetDeviceInfo(devid, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &param, NULL);
	printf("CL_DEVICE_MAX_WORK_GROUP_SIZE\t%u\n", param);

	clGetDeviceInfo(devid, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &param, NULL);
	printf("CL_DEVICE_LOCAL_MEM_SIZE\t%ub\n", param);

	/* Check if kernel source exists, we compile argv[1] passed kernel */
	if(argv[1] == NULL) { printf("\nUsage: %s kernel_source.cl kernel_function\n", argv[0]); exit(1); }

	char *kernel_source;
	if(load_program_source(argv[1], &kernel_source)) return 1;
	
	printf("Building from OpenCL source: \t%s\n", argv[1]);
	printf("Compile/query OpenCL_program:\t%s\n", argv[2]);
	
	/* Create context and kernel program */
	cl_context context = 	clCreateContext(0, 1, &devid, NULL, NULL, NULL);
	cl_program pro = 	clCreateProgramWithSource(context, 1, (const char **)&kernel_source, NULL, NULL);
	res = 			clBuildProgram(pro, 1, &devid, "-cl-fast-relaxed-math", NULL, NULL);

	if(res != CL_SUCCESS){
		printf("clBuildProgram failed: %d\n", res); char buf[0x10000];
		clGetProgramBuildInfo(pro, devid, CL_PROGRAM_BUILD_LOG, 0x10000, buf, NULL);
		printf("\n%s\n", buf); return(-1); }

	cl_kernel kernelobj = clCreateKernel(pro, argv[2], &res); 	check_return(res);
	
	/* Get the maximum work-group size for executing the kernel on the device */
	size_t global, local;
	res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_WORK_GROUP_SIZE, sizeof(int), &local, NULL);		check_return(res);
	printf("CL_KERNEL_WORK_GROUP_SIZE\t%u\n", local);
	
	res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &param, NULL);	check_return(res);
	printf("CL_KERNEL_LOCAL_MEM_SIZE\t%ub\n", param);
	
	cl_command_queue cmd_queue = clCreateCommandQueue(context, devid, CL_QUEUE_PROFILING_ENABLE, NULL);
	if(cmd_queue == NULL) { printf("Compute device setup failed\n"); return(-1); }

	local = 4;
	int n = 2 * local;	//num_group * local workgroup size 
	global = n;
	
	int	num_groups=		global / local,
		allocated_local=	sizeof(data) * local + 
					sizeof(debug) * local;

	data *DP __attribute__ ((aligned(16)));
	DP = calloc(n, sizeof(data) *1);

	debug *dbg __attribute__ ((aligned(16)));
	dbg = calloc(n, sizeof(debug));
	
	printf("global:%d, local:%d, (should be):%d groups\n", global, local, num_groups);
	printf("structs size: %db, %db, %db\n", sizeof(data), sizeof(Elliptic_Curve), sizeof(inv256));
	printf("sets:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(cl_uint4) *5 *4, allocated_local);

	cl_mem	cl_DP, cl_EC, cl_INV, DEBUG;
	cl_DP = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, n * sizeof(data), NULL, &res);					check_return(res);				
	cl_EC = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY,  1 * sizeof(Elliptic_Curve), NULL, &res);	check_return(res);	//_constant address space
	cl_INV= clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY,  1 * sizeof(u8) * 0x80, NULL, &res);		check_return(res);
	DEBUG = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_WRITE_ONLY, n * sizeof(debug), NULL, &res);		check_return(res);
	
	Elliptic_Curve EC;
	/*	
		Curve domain parameters, (test vectors)
		-------------------------------------------------------------------------------------
		p:	c1c627e1638fdc8e24299bb041e4e23af4bb5427		is prime
		a:	c1c627e1638fdc8e24299bb041e4e23af4bb5424		divisor g = 62980
		b:	877a6d84155a1de374b72d9f9d93b36bb563b2ab		divisor g = 227169643
		Gx: 	010aff82b3ac72569ae645af3b527be133442131		divisor g = 32209245
		Gy: 	46b8ec1e6d71e5ecb549614887d57a287df573cc		divisor g = 972	
		precomputed_per_curve_constants:
		U:	c1c627e1638fdc8e24299bb041e4e23af4bb5425
		V:	3e39d81e9c702371dbd6644fbe1b1dc50b44abd9
		
		already prepared mod p to test:
		a:      07189f858e3f723890a66ec1079388ebd2ed509c
		b:      6043379beb0dade6eed1e9d6de64f4a0c50639d4
		gx:     5ef84aacf4f0ea6752f572d0741f40049f354dca
		gy:     418c695435af6b3d4d7cbb72967395016ef67239
		resulting point:
		P.x:    01718f862ebe9423bd661a65355aa1c86ba330f8		program MUST got this point !!
		P.y:    557e8ed53ffbfe2c990a121967b340f62e0e4fe2
		taken mod p:
		P.x:    41da1a8f74ff8d3f1ce20ef3e9d8865c96014fe3		
		P.y:    73ca143c9badedf2d9d3c7573307115ccfe04f13
	*/	
	u8 *t;
	t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5427");	memcpy(EC.p, t, 20);
	t = _x_to_u8_buffer("07189f858e3f723890a66ec1079388ebd2ed509c");	memcpy(EC.a, t, 20);
	t = _x_to_u8_buffer("6043379beb0dade6eed1e9d6de64f4a0c50639d4");	memcpy(EC.b, t, 20);
	t = _x_to_u8_buffer("5ef84aacf4f0ea6752f572d0741f40049f354dca");	memcpy(EC.Gx, t, 20);
	t = _x_to_u8_buffer("418c695435af6b3d4d7cbb72967395016ef67239");	memcpy(EC.Gy, t, 20);
	
	t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5425");	memcpy(EC.U, t, 20);
	t = _x_to_u8_buffer("3e39d81e9c702371dbd6644fbe1b1dc50b44abd9");	memcpy(EC.V, t, 20);

	/* we need to map buffer now to load some k into data */
	DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_WRITE, 0, n * sizeof(data),  0, NULL, NULL, &res);	check_return(res);

	t = _x_to_u8_buffer("00542d46e7b3daac8aeb81e533873aabd6d74bb710");
	for(u8 i = 0; i < n; i++) memcpy(DP[i].k, t, 21);
	
	free(t);
//d	for(u8 i = 0; i < n; i++) bn_print("", DP[i].k, 21, 1);

	/* we can alter just a byte into a chosen k to verify that we'll get a different point! */
	//DP[2].k[2] = 0x09;
	
//no	res = clEnqueueWriteBuffer(cmd_queue, cl_DP,  CL_TRUE, 0, n * sizeof(data), &DP, 0, NULL, NULL);	check_return(res);

	res = clEnqueueWriteBuffer(cmd_queue, cl_EC,  CL_TRUE, 0, 1 * sizeof(Elliptic_Curve), &EC, 0, NULL, NULL);	check_return(res);
	res = clEnqueueWriteBuffer(cmd_queue, cl_INV, CL_TRUE, 0, 1 * sizeof(u8) * 0x80, &inv256, 0, NULL, NULL);	check_return(res);

	res = clSetKernelArg(kernelobj, 0, sizeof(cl_mem), &cl_DP);		/* i/o buffer */
	res|= clSetKernelArg(kernelobj, 1, sizeof(data) * local *1, NULL);	//allocate space for __local in kernel (just this!) one * localsize
	res|= clSetKernelArg(kernelobj, 2, sizeof(cl_mem), &cl_EC);
	res|= clSetKernelArg(kernelobj, 3, sizeof(cl_mem), &cl_INV);	
	res|= clSetKernelArg(kernelobj, 4, sizeof(debug) * local *1, NULL);	//allocate space for __local in kernel (just this!) one * localsize
	res|= clSetKernelArg(kernelobj, 5, sizeof(cl_mem), &DEBUG);		//this used to debug kernel output
	check_return(res);

//	printf("n:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(debug), allocated_local);	
	
	cl_event NDRangeEvent;
	cl_ulong start, end;
	
	/* Execute NDrange */	
	res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, &local, 0, NULL, &NDRangeEvent);		check_return(res);
//	res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, NULL, 0, NULL, &NDRangeEvent);		check_return(res);
	
	printf("Read back, Mapping buffer:\t%db\n", n * sizeof(data));

	DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_READ, 0, n * sizeof(data),  0, NULL, NULL, &res);	check_return(res);
	dbg =clEnqueueMapBuffer(cmd_queue, DEBUG, CL_TRUE, CL_MAP_READ, 0, n * sizeof(debug), 0, NULL, NULL, &res);	check_return(res);
	
	/* using clEnqueueReadBuffer template */
//	res = clEnqueueReadBuffer(cmd_queue, ST, CL_TRUE, 0, sets * sizeof(cl_uint8), dbg, 0, NULL, NULL);			check_return(res);
		
	clFlush(cmd_queue);
	clFinish(cmd_queue);

	/* get NDRange execution time with internal ocl profiler */
	res = clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
	res|= clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &end,   NULL);
	check_return(res);
	printf("kernel execution time:\t\t%.2f ms\n", (float) ((end - start) /1000000));			//relative to NDRange call
	printf("number of computes/sec:\t%.2f\n", (float) global *1000000 /((end - start)));

	
	printf("i,\tgid\tlid0\tlsize0\tgid0/lsz0,\tgsz0,\tn_gr0,\tlid5,\toffset\n");
	for(int i = 0; i < n; i++) {		
//		if(i %local == 0) {
			printf("%d \t", i);
			//printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", *p, *(p +1), *(p +2), *(p +3), *(p +4), *(p +5), *(p +6), *(p +7));
			
			/* silence this doubled debug info
			printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", 
				dbg[i].data[0], dbg[i].data[1], dbg[i].data[2], dbg[i].data[3],
				dbg[i].data[4], dbg[i].data[5], dbg[i].data[6], dbg[i].data[7]);
			*/	
			//printf("%d %d\n", P[i].dig, P[i].c);
			bn_print("", DP[i].k, 21, 1);
			bn_print("", DP[i].rx, 20, 0); bn_print(" ", DP[i].ry, 20, 1);
			
			printf("%u(/%u) = %u*%u(/%u) +%u, offset:%u, stride:%u\n", 
				DP[i].pad[0], DP[i].pad[1], DP[i].pad[2], DP[i].pad[3],
				DP[i].pad[4], DP[i].pad[5], DP[i].pad[6], DP[i].pad[7]);
//		}
	}
	
	/* Release OpenCL stuff, free the rest */
	clReleaseMemObject(cl_DP);
	clReleaseMemObject(cl_EC);
	clReleaseMemObject(cl_INV);
	clReleaseMemObject(DEBUG);
	clReleaseKernel(kernelobj);
	clReleaseProgram(pro);
	clReleaseCommandQueue(cmd_queue);
	clReleaseContext(context);
	
	free(kernel_source);
	
	puts("Done!");
	return 0;
}
int main()
{
	srand(unsigned(time(nullptr)));
	int err;                            // error code returned from api calls

	cl_device_id device_id;             // compute device id 
	cl_context context;                 // compute context
	cl_command_queue commands;          // compute command queue
	cl_program program;                 // compute program
	cl_kernel kernel;                   // compute kernel

										// OpenCL device memory for matrices
	cl_mem d_A;
	cl_mem d_B;
	cl_mem d_C;

	// set seed for rand()
	srand(2014);

	//Allocate host memory for matrices A and B
	unsigned int size_A = WA * HA;
	unsigned int mem_size_A = sizeof(float) * size_A;
	float* h_A = (float*)malloc(mem_size_A);

	unsigned int size_B = WB * HB;
	unsigned int mem_size_B = sizeof(float) * size_B;
	float* h_B = (float*)malloc(mem_size_B);

	//Initialize host memory
	randomMemInit(h_A, size_A);
	randomMemInit(h_B, size_B);

	//Allocate host memory for the result C
	unsigned int size_C = WC * HC;
	unsigned int mem_size_C = sizeof(float) * size_C;
	float* h_C = (float*)malloc(mem_size_C);

	printf("Initializing OpenCL device...\n");

	cl_uint dev_cnt = 0;
	clGetPlatformIDs(0, 0, &dev_cnt);

	cl_platform_id platform_ids[100];
	clGetPlatformIDs(dev_cnt, platform_ids, NULL);

	// Connect to a compute device
	int gpu = 1;
	err = clGetDeviceIDs(platform_ids[0], gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
	if (err != CL_SUCCESS){
		printf("Error: Failed to create a device group!\n");
		return EXIT_FAILURE;
	}

	// Create a compute context 
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if (!context){
		printf("Error: Failed to create a compute context!\n");
		return EXIT_FAILURE;
	}

	// Create a command commands
	commands = clCreateCommandQueue(context, device_id, 0, &err);
	if (!commands){
		printf("Error: Failed to create a command commands!\n");
		return EXIT_FAILURE;
	}

	// Create the compute program from the source file
	char *KernelSource;
	long lFileSize = LoadOpenCLKernel("matrixmul_kernel.cl", &KernelSource);
	if (lFileSize < 0L){
		perror("File read failed");
		return 1;
	}
	//const char* KernelSource = loadKernelCPP(".\\matrixmul_kernel.cl");

	program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err);
	if (!program){
		printf("Error: Failed to create compute program!\n");
		return EXIT_FAILURE;
	}

	// Build the program executable
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS){
		size_t len;
		char buffer[2048];
		printf("Error: Failed to build program executable!\n");
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);
		exit(1);
	}

	// Create the compute kernel in the program we wish to run
	kernel = clCreateKernel(program, "matrixMul", &err);
	if (!kernel || err != CL_SUCCESS){
		printf("Error: Failed to create compute kernel!\n");
		exit(1);
	}

	// Create the input and output arrays in device memory for our calculation
	d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, NULL, &err);
	d_A = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &err);
	d_B = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &err);

	if (!d_A || !d_B || !d_C){
		printf("Error: Failed to allocate device memory!\n");
		exit(1);
	}

	printf("Running matrix multiplication for matrices A (%dx%d) and B (%dx%d) ...\n", WA, HA, WB, HB);

	//Launch OpenCL kernel
	size_t localWorkSize[2], globalWorkSize[2];

	int wA = WA;
	int wC = WC;
	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_C);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_A);
	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_B);
	err |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&wA);
	err |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&wC);

	if (err != CL_SUCCESS){
		printf("Error: Failed to set kernel arguments! %d\n", err);
		exit(1);
	}

	localWorkSize[0] = 16;
	localWorkSize[1] = 16;
	globalWorkSize[0] = 1024;
	globalWorkSize[1] = 1024;

	err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);

	if (err != CL_SUCCESS){
		printf("Error: Failed to execute kernel! %d\n", err);
		exit(1);
	}

	//Retrieve result from device
	err = clEnqueueReadBuffer(commands, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL);

	if (err != CL_SUCCESS){
		printf("Error: Failed to read output array! %d\n", err);
		exit(1);
	}
	//print table A

	printf("\nMatrix A\n");
	for (int i = 0; i < size_A; i++){
		printf("%f\t", h_A[i]);
		if (((i + 1) % WA) == 0)
			printf("\n");
	}

	//print table B

	printf("\nMatrix B\n");
	for (int i = 0; i < size_B; i++){
		printf("%f\t", h_B[i]);
		if (((i + 1) % WB) == 0)
			printf("\n");
	}

	//print out the results

	printf("\nMatrix C (Results)\n");
	for (int i = 0; i < size_C; i++){
		printf("%f\t", h_C[i]);
		if (((i + 1) % WC) == 0)
			printf("\n");
	}
	printf("\n");


	printf("Matrix multiplication completed...\n");

	//Shutdown and cleanup
	free(h_A);
	free(h_B);
	free(h_C);

	clReleaseMemObject(d_A);
	clReleaseMemObject(d_C);
	clReleaseMemObject(d_B);

	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	std::cin.clear();
	std::cin.sync();
	std::cin.get();
}
Example #22
0
int
main ()
{
  int err, i;
  cl_platform_id platform;
  cl_device_id device;
  cl_context context;
  cl_context_properties context_props[3];
  cl_command_queue queue;
  cl_program program;
  cl_kernel kernel;
  cl_mem buffer;

  size_t len;
  const char *program_source = NULL;
  char *device_extensions = NULL;
  char kernel_build_opts[256];
  size_t size = sizeof (cl_int) * SIZE;
  const size_t global_work_size[] = {SIZE, 0, 0}; /* size of each dimension */
  cl_int *data;

  /* In order to see which devices the OpenCL implementation on your platform
     provides you may issue a call to the print_clinfo () fuction.  */

  /* Initialize the data the OpenCl program operates on.  */
  data = (cl_int*) calloc (1, size);
  if (data == NULL)
    {
      fprintf (stderr, "calloc failed\n");
      exit (EXIT_FAILURE);
    }

  /* Pick the first platform.  */
  CHK (clGetPlatformIDs (1, &platform, NULL));
  /* Get the default device and create context.  */
  CHK (clGetDeviceIDs (platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL));
  context_props[0] = CL_CONTEXT_PLATFORM;
  context_props[1] = (cl_context_properties) platform;
  context_props[2] = 0;
  context = clCreateContext (context_props, 1, &device, NULL, NULL, &err);
  CHK_ERR ("clCreateContext", err);
  queue = clCreateCommandQueue (context, device, 0, &err);
  CHK_ERR ("clCreateCommandQueue", err);

  /* Query OpenCL extensions of that device.  */
  CHK (clGetDeviceInfo (device, CL_DEVICE_EXTENSIONS, 0, NULL, &len));
  device_extensions = (char *) malloc (len);
  CHK (clGetDeviceInfo (device, CL_DEVICE_EXTENSIONS, len, device_extensions,
			NULL));
  strcpy (kernel_build_opts, "-Werror -cl-opt-disable");
  if (strstr (device_extensions, "cl_khr_fp64") != NULL)
    strcpy (kernel_build_opts + strlen (kernel_build_opts),
	    " -D HAVE_cl_khr_fp64");
  if (strstr (device_extensions, "cl_khr_fp16") != NULL)
    strcpy (kernel_build_opts + strlen (kernel_build_opts),
	    " -D HAVE_cl_khr_fp16");

  /* Read the OpenCL kernel source into the main memory.  */
  program_source = read_file (STRINGIFY (CL_SOURCE), &len);
  if (program_source == NULL)
    {
      fprintf (stderr, "file does not exist: %s\n", STRINGIFY (CL_SOURCE));
      exit (EXIT_FAILURE);
    }

  /* Build the OpenCL kernel.  */
  program = clCreateProgramWithSource (context, 1, &program_source,
				       &len, &err);
  free ((void*) program_source);
  CHK_ERR ("clCreateProgramWithSource", err);
  err = clBuildProgram (program, 0, NULL, kernel_build_opts, NULL,
			NULL);
  if (err != CL_SUCCESS)
    {
      size_t len;
      char *clbuild_log = NULL;
      CHK (clGetProgramBuildInfo (program, device, CL_PROGRAM_BUILD_LOG, 0,
				  NULL, &len));
      clbuild_log = malloc (len);
      if (clbuild_log)
	{
	  CHK (clGetProgramBuildInfo (program, device, CL_PROGRAM_BUILD_LOG,
				      len, clbuild_log, NULL));
	  fprintf (stderr, "clBuildProgram failed with:\n%s\n", clbuild_log);
 	  free (clbuild_log);
        }
      exit (EXIT_FAILURE);
  }

  /* In some cases it might be handy to save the OpenCL program binaries to do
     further analysis on them.  In order to do so you may call the following
     function: save_program_binaries (program);.  */

  kernel = clCreateKernel (program, "testkernel", &err);
  CHK_ERR ("clCreateKernel", err);

  /* Setup the input data for the kernel.  */
  buffer = clCreateBuffer (context, CL_MEM_USE_HOST_PTR, size, data, &err);
  CHK_ERR ("clCreateBuffer", err);

  /* Execute the kernel (data parallel).  */
  CHK (clSetKernelArg (kernel, 0, sizeof (buffer), &buffer));
  CHK (clEnqueueNDRangeKernel (queue, kernel, 1, NULL, global_work_size, NULL,
			       0, NULL, NULL));

  /* Fetch the results (blocking).  */
  CHK (clEnqueueReadBuffer (queue, buffer, CL_TRUE, 0, size, data, 0, NULL,
			    NULL));

  /* Compare the results.  */
  for (i = 0; i < SIZE; i++)
    {
      if (data[i] != 0x1)
	{
	  fprintf (stderr, "error: data[%d]: %d != 0x1\n", i, data[i]);
	  exit (EXIT_FAILURE);
	}
    }

  /* Cleanup.  */
  CHK (clReleaseMemObject (buffer));
  CHK (clReleaseKernel (kernel));
  CHK (clReleaseProgram (program));
  CHK (clReleaseCommandQueue (queue));
  CHK (clReleaseContext (context));
  free (data);

  return 0;
}
Example #23
0
/**
 * @brief Creates an array of objects containing the OpenCL variables of each device
 * @param trDataBase The training database which will contain the instances and the features
 * @param selInstances The instances choosen as initial centroids
 * @param transposedTrDataBase The training database already transposed
 * @param conf The structure with all configuration parameters
 * @return A pointer containing the objects
 */
CLDevice *createDevices(const float *const trDataBase, const int *const selInstances, const float *const transposedTrDataBase, Config *const conf) {


	/********** Find the OpenCL devices specified in configuration ***********/

	// OpenCL variables
	cl_uint numPlatformsDevices;
	cl_device_type deviceType;
	cl_program program;
	cl_kernel kernel;
	cl_int status;

	// Others variables
	auto allDevices = getAllDevices();
	CLDevice *devices = new CLDevice[conf -> nDevices + (conf -> ompThreads > 0)];

	for (int dev = 0; dev < conf -> nDevices; ++dev) {

		bool found = false;
		for (int allDev = 0; allDev < allDevices.size() && !found; ++allDev) {

			// Get the specified OpenCL device
			char dbuff[120];
			check(clGetDeviceInfo(allDevices[allDev], CL_DEVICE_NAME, sizeof(dbuff), dbuff, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_NAME);

			// If the device exists...
			if (conf -> devices[dev] == dbuff) {
				devices[dev].device = allDevices[allDev];
				devices[dev].deviceName = dbuff;
				check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_TYPE, sizeof(cl_device_type), &(devices[dev].deviceType), NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_TYPE);


				/********** Device local memory usage ***********/

				long int usedMemory = conf -> nFeatures * sizeof(cl_uchar); // Chromosome of the individual
				usedMemory += conf -> trNInstances * sizeof(cl_uchar); // Mapping buffer
				usedMemory += conf -> K * conf -> nFeatures * sizeof(cl_float); // Centroids buffer
				usedMemory += conf -> trNInstances * sizeof(cl_float); // DistCentroids buffer
				usedMemory += conf -> K * sizeof(cl_int); // Samples_in_k buffer

				// Get the maximum local memory size
				long int maxMemory;
				check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(long int), &maxMemory, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_MAXMEM);

				// Avoid exceeding the maximum local memory available. 1024 bytes of margin
				check(usedMemory > maxMemory - 1024, "%s:\n\tMax memory: %ld bytes\n\tAllow memory: %ld bytes\n\tUsed memory: %ld bytes\n", CL_ERROR_DEVICE_LOCALMEM, maxMemory, maxMemory - 1024, usedMemory);


				/********** Create context ***********/

				devices[dev].context = clCreateContext(NULL, 1, &(devices[dev].device), 0, 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_CONTEXT);


				/********** Create Command queue ***********/

				devices[dev].commandQueue = clCreateCommandQueue(devices[dev].context, devices[dev].device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_QUEUE);


				/********** Create kernel ***********/

				// Open the file containing the kernels
				std::fstream kernels(conf -> kernelsFileName.c_str(), std::fstream::in);
				check(!kernels.is_open(), "%s\n", CL_ERROR_FILE_OPEN);

				// Obtain the size
				kernels.seekg(0, kernels.end);
				size_t fSize = kernels.tellg();
				kernels.seekg(0, kernels.beg);

				char *kernelSource = new char[fSize];
				kernels.read(kernelSource, fSize);
				kernels.close();

				// Create program
				program = clCreateProgramWithSource(devices[dev].context, 1, (const char **) &kernelSource, &fSize, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_BUILD);

				// Build program for the device in the context
				char buildOptions[196];
				sprintf(buildOptions, "-I include -D N_INSTANCES=%d -D N_FEATURES=%d -D N_OBJECTIVES=%d -D K=%d -D MAX_ITER_KMEANS=%d", conf -> trNInstances, conf -> nFeatures, conf -> nObjectives, conf -> K, conf -> maxIterKmeans);
				if (clBuildProgram(program, 1, &(devices[dev].device), buildOptions, 0, 0) != CL_SUCCESS) {
					char buffer[4096];
					fprintf(stderr, "Error: Could not build the program\n");
					check(clGetProgramBuildInfo(program, devices[dev].device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_ERRORS);
					check(true, "%s\n", buffer);
				}

				// Create kernel
				const char *kernelName = (devices[dev].deviceType == CL_DEVICE_TYPE_GPU) ? "kmeansGPU" : "";
				devices[dev].kernel = clCreateKernel(program, kernelName, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_BUILD);


				/******* Work-items *******/

				devices[dev].computeUnits = atoi(conf -> computeUnits[dev].c_str());
				devices[dev].wiLocal = atoi(conf -> wiLocal[dev].c_str());
				devices[dev].wiGlobal = devices[dev].computeUnits * devices[dev].wiLocal;


				/******* Create and write the databases and centroids buffers. Create the subpopulations buffer. Set kernel arguments *******/

				// Create buffers
				devices[dev].objSubpopulations = clCreateBuffer(devices[dev].context, CL_MEM_READ_WRITE, conf -> familySize * sizeof(Individual), 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_SUBPOPS);

				devices[dev].objTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TRDB);

				devices[dev].objTransposedTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TTRDB);

				devices[dev].objSelInstances = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> K * sizeof(cl_int), 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_CENTROIDS);

				// Sets kernel arguments
				check(clSetKernelArg(devices[dev].kernel, 0, sizeof(cl_mem), (void *)&(devices[dev].objSubpopulations)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT1);

				check(clSetKernelArg(devices[dev].kernel, 1, sizeof(cl_mem), (void *)&(devices[dev].objSelInstances)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT2);

				check(clSetKernelArg(devices[dev].kernel, 2, sizeof(cl_mem), (void *)&(devices[dev].objTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT3);

				check(clSetKernelArg(devices[dev].kernel, 5, sizeof(cl_mem), (void *)&(devices[dev].objTransposedTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT6);

				// Write buffers
				check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), trDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TRDB);
				check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objSelInstances, CL_FALSE, 0, conf -> K * sizeof(cl_int), selInstances, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_CENTROIDS);
				check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTransposedTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), transposedTrDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TTRDB);

				// Resources used are released
				delete[] kernelSource;
				clReleaseProgram(program);

				found = true;
				allDevices.erase(allDevices.begin() + allDev);
			}
		}

		check(!found, "%s\n", CL_ERROR_DEVICE_FOUND);
	}


	/********** Add the CPU if has been enabled in configuration ***********/

	if (conf -> ompThreads > 0) {
		devices[conf -> nDevices].deviceType = CL_DEVICE_TYPE_CPU;
		devices[conf -> nDevices].computeUnits = conf -> ompThreads;
		++(conf -> nDevices);
	}

	return devices;
}
Example #24
0
void 
init(OptionParser& op, bool _do_dp)
{
    cl_int err;

    do_dp = _do_dp;
    
    if (!fftCtx) {
        // first get the device
        int device, platform = op.getOptionInt("platform");
        if (op.getOptionVecInt("device").size() > 0) {
            device = op.getOptionVecInt("device")[0];
        }
        else {
            device = 0;
        }
        fftDev = ListDevicesAndGetDevice(platform, device);

        // now get the context
        fftCtx = clCreateContext(NULL, 1, &fftDev, NULL, NULL, &err);
        CL_CHECK_ERROR(err);
    }

    if (!fftQueue) {
        // get a queue
        fftQueue = clCreateCommandQueue(fftCtx, fftDev, CL_QUEUE_PROFILING_ENABLE,
                                        &err);
        CL_CHECK_ERROR(err);
    }
    
    // create the program...
    fftProg = clCreateProgramWithSource(fftCtx, 1, &cl_source_fft, NULL, &err);
    CL_CHECK_ERROR(err);
    
    // ...and build it
    string args = " -cl-mad-enable ";
    if (op.getOptionBool("use-native")) {
        args += " -cl-fast-relaxed-math ";
    }
    if (!do_dp) {
        args += " -DSINGLE_PRECISION ";
    }
    else if (checkExtension(fftDev, "cl_khr_fp64")) {
        args += " -DK_DOUBLE_PRECISION ";
    }
    else if (checkExtension(fftDev, "cl_amd_fp64")) {
        args += " -DAMD_DOUBLE_PRECISION ";
    }
    err = clBuildProgram(fftProg, 0, NULL, args.c_str(), NULL, NULL);
    {
        char* log = NULL;
        size_t bytesRequired = 0;
        err = clGetProgramBuildInfo(fftProg, 
                                    fftDev, 
                                    CL_PROGRAM_BUILD_LOG,
                                    0,
                                    NULL,
                                    &bytesRequired );
        log = (char*)malloc( bytesRequired + 1 );
        err = clGetProgramBuildInfo(fftProg, 
                                    fftDev, 
                                    CL_PROGRAM_BUILD_LOG,
                                    bytesRequired,
                                    log,
                                    NULL );
        std::cout << log << std::endl;
        free( log );
    }
    if (err != CL_SUCCESS) {
        char log[50000];
        size_t retsize = 0;
        err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG,
                                    50000*sizeof(char),  log, &retsize);
        CL_CHECK_ERROR(err);
        cout << "Retsize: " << retsize << endl;
        cout << "Log: " << log << endl;
        dumpPTXCode(fftCtx, fftProg, "oclFFT");
        exit(-1);
    }
    else {
        // dumpPTXCode(fftCtx, fftProg, "oclFFT");
    }

    // Create kernel for forward FFT
    fftKrnl = clCreateKernel(fftProg, "fft1D_512", &err);
    CL_CHECK_ERROR(err);
    // Create kernel for inverse FFT
    ifftKrnl = clCreateKernel(fftProg, "ifft1D_512", &err);
    CL_CHECK_ERROR(err);
    // Create kernel for check
    chkKrnl = clCreateKernel(fftProg, "chk1D_512", &err);
    CL_CHECK_ERROR(err);
}
Example #25
0
int main(int argc, char** argv)
{
	ocd_init(&argc, &argv, NULL);
	ocd_initCL();

	cl_int err;

	size_t global_size;
	size_t local_size;

	cl_program program;
	cl_kernel kernel_compute_flux;
	cl_kernel kernel_compute_flux_contributions;
	cl_kernel kernel_compute_step_factor;
	cl_kernel kernel_time_step;
	cl_kernel kernel_initialize_variables;

	cl_mem ff_variable;
	cl_mem ff_fc_momentum_x;
	cl_mem ff_fc_momentum_y;
	cl_mem ff_fc_momentum_z;
	cl_mem ff_fc_density_energy;

	if (argc < 2)
	{
		printf("Usage ./cfd <data input file>\n");
		return 0;
	}


	const char* data_file_name = argv[1];


	// set far field conditions and load them into constant memory on the gpu
	{
		float h_ff_variable[NVAR];
		const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack);

		h_ff_variable[VAR_DENSITY] = (float)(1.4);

		float ff_pressure = (float)(1.0);
		float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]);
		float ff_speed = (float)(ff_mach)*ff_speed_of_sound;

		float3 ff_velocity;
		ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack));
		ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack));
		ff_velocity.z = 0.0;

		h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x;
		h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y;
		h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z;

		h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0));

		float3 h_ff_momentum;
		h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0);
		h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1);
		h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2);
		float3 h_ff_fc_momentum_x;
		float3 h_ff_fc_momentum_y;
		float3 h_ff_fc_momentum_z;
		float3 h_ff_fc_density_energy;
		compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum,
				&h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity,
				&h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z,
				&h_ff_fc_density_energy);

		// copy far field conditions to the gpu
		ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err);
		CHKERR(err, "Unable to allocate ff data");
	}
	int nel;
	int nelr;

	// read in domain geometry
	cl_mem areas;
	cl_mem elements_surrounding_elements;
	cl_mem normals;
	{
		std::ifstream file(data_file_name);

		file >> nel;

		nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length));

		float* h_areas = new float[nelr];
		int* h_elements_surrounding_elements = new int[nelr*NNB];
		float* h_normals = new float[nelr*NDIM*NNB];


		// read in data
		for(int i = 0; i < nel; i++)
		{
			file >> h_areas[i];
			for(int j = 0; j < NNB; j++)
			{
				file >> h_elements_surrounding_elements[i + j*nelr];
				if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1;
				h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering

				for(int k = 0; k < NDIM; k++)
				{
					file >> h_normals[i + (j + k*NNB)*nelr];
					h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr];
				}
			}
		}

		// fill in remaining data
		int last = nel-1;
		for(int i = nel; i < nelr; i++)
		{
			h_areas[i] = h_areas[last];
			for(int j = 0; j < NNB; j++)
			{
				// duplicate the last element
				h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr];
				for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr];
			}
		}

		areas = alloc<float>(context, nelr);
		upload<float>(commands, areas, h_areas, nelr);

		elements_surrounding_elements = alloc<int>(context, nelr*NNB);
		upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB);

		normals = alloc<float>(context, nelr*NDIM*NNB);
		upload<float>(commands, normals, h_normals, nelr*NDIM*NNB);

		delete[] h_areas;
		delete[] h_elements_surrounding_elements;
		delete[] h_normals;
	}

	// Get program source.
	long kernelSize = getKernelSize();
	char* kernelSource = new char[kernelSize];
	getKernelSource(kernelSource, kernelSize);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, NULL, &err);
	CHKERR(err, "Failed to create a compute program!");

	// Build the program executable
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err == CL_BUILD_PROGRAM_FAILURE)
	{
		char *log;
		size_t logLen;
		err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen);
		log = (char *) malloc(sizeof(char)*logLen);
		err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) log, NULL);
		fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, log);
		free(log);
		exit(1);
	}
	CHKERR(err, "Failed to build program!");
	delete[] kernelSource;

	// Create the compute kernel in the program we wish to run
	kernel_compute_flux = clCreateKernel(program, "compute_flux", &err);
	CHKERR(err, "Failed to create a compute kernel!");

	// Create the reduce kernel in the program we wish to run
	kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err);
	CHKERR(err, "Failed to create a compute_flux_contributions kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err);
	CHKERR(err, "Failed to create a compute_step_factor kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_time_step = clCreateKernel(program, "time_step", &err);
	CHKERR(err, "Failed to create a time_step kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err);
	CHKERR(err, "Failed to create a initialize_variables kernel!");

	// Create arrays and set initial conditions
	cl_mem variables = alloc<cl_float>(context, nelr*NVAR);

	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	//err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!");
	local_size = 1;//std::min(local_size, (size_t)nelr);
	global_size = nelr;
	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	err = clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0");


	cl_mem old_variables = alloc<float>(context, nelr*NVAR);
	cl_mem fluxes = alloc<float>(context, nelr*NVAR);
	cl_mem step_factors = alloc<float>(context, nelr);
	clFinish(commands);
	cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM);
	cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM);
	cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM);
	cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM);
	clFinish(commands);

	// make sure all memory is floatly allocated before we start timing
	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!");
	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1");
	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!");

	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2");
	std::cout << "About to memcopy" << std::endl;
	err = clReleaseMemObject(step_factors);
	float temp[nelr];
	for(int i = 0; i < nelr; i++)
		temp[i] = 0;
	step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err);
	CHKERR(err, "Unable to memset step_factors");
	// make sure CUDA isn't still doing something before we start timing

	clFinish(commands);

	// these need to be computed the first time in order to compute time step
	std::cout << "Starting..." << std::endl;


	// Begin iterations
	for(int i = 0; i < iterations; i++)
	{
		copy<float>(commands, old_variables, variables, nelr*NVAR);

		// for the first iteration we compute the time step
		err = 0;
		err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr);
		err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables);
		err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas);
		err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors);
		CHKERR(err, "Failed to set kernel arguments!");
		// Get the maximum work group size for executing the kernel on the device
		err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
		CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!");
		err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!");
		for(int j = 0; j < RK; j++)
		{
			err = 0;
			err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer)
			//compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy);
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!");
			err = 0;
			err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements);
			err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals);
			err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables);
			err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy);
			err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes);
			err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable);
			err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer)
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!");
			err = 0;
			err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j);
			err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables);
			err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables);
			err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors);
			err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_time_step work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer)
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_time_step]!");
		}
	}

	clFinish(commands);
	std::cout << "Finished" << std::endl;
	std::cout << "Saving solution..." << std::endl;
	dump(commands, variables, nel, nelr);
	std::cout << "Saved solution..." << std::endl;
	std::cout << "Cleaning up..." << std::endl;

	clReleaseProgram(program);
	clReleaseKernel(kernel_compute_flux);
	clReleaseKernel(kernel_compute_flux_contributions);
	clReleaseKernel(kernel_compute_step_factor);
	clReleaseKernel(kernel_time_step);
	clReleaseKernel(kernel_initialize_variables);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	dealloc<float>(areas);
	dealloc<int>(elements_surrounding_elements);
	dealloc<float>(normals);

	dealloc<float>(variables);
	dealloc<float>(old_variables);
	dealloc<float>(fluxes);
	dealloc<float>(step_factors);
	dealloc<float>(fc_momentum_x);
	dealloc<float>(fc_momentum_y);
	dealloc<float>(fc_momentum_z);
	dealloc<float>(fc_density_energy);

	std::cout << "Done..." << std::endl;
	ocd_finalize();
	return 0;
}
Example #26
0
int main(int argc, char** argv)
{
    int err;                            // error code returned from api calls

    float *cpu_xyz;						//calculate the results on the CPU
    float *gpu_xyz;						//calculate the results on the GPU
    unsigned int correct;               // number of correct results returned

    char* kernel_source;				//kernel source code

    cl_platform_id platform_id;			// compute platform id
    cl_device_id device_id;             // compute device id
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;        			// compute kernel code

    //stuff were going to query
    cl_int preferred_workgroup_size;
    cl_int max_workgroup_size;

    cl_mem cl_output;                      // device memory used for the output array
    int i, j, k;

    if(!cl_load("main.cl", &kernel_source))
    {
    	printf("Your file didn't load.");
    	return 1;
    }

    int theta, phi, r;
    for(i=0; i<V_THETA_MAX; i++)
    {
    	theta = i*V_THETA_INC;
    	for(j=0; j<V_PHI_MAX; j++)
    	{
    		phi = j*V_PHI_INC;
    		for(k=0; k<V_R_MAX; k++)
    		{
    			r = k*V_R_INC;
    		}
    	}
    }

    //get the platform information.  This corresponds to vendor implementations of opencl
    cl_uint platforms;
    err = clGetPlatformIDs(1, &platform_id, &platforms);
    if(err != CL_SUCCESS)
    {
    	printf("Error: Failed to query platform ids!\n");
		return EXIT_FAILURE;
    }

    // Connect to a compute device.  A GPU in this case.
    //
    cl_uint num_devices;
    err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }

    // Create a compute context.  A handle to the combination of platform and device.
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command queue.  We will use this to send work to the CPU.
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    // create a program, given the loaded source code.
    //
    program = clCreateProgramWithSource(context, 1, (const char **) &kernel_source, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }

    // Compile the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    // create (select) a kernel function from the compiled program
    //
    kernel = clCreateKernel(program, "square", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }

    // Get the preferred workgroup size multiple
    //
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(preferred_workgroup_size), &preferred_workgroup_size, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }

    // Get the preferred workgroup size
	//
	err = clGetKernelWorkGroupInfo(kernel, device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_workgroup_size), &max_workgroup_size, NULL);
	if (err != CL_SUCCESS)
	{
		printf("Error: Failed to retrieve kernel work group info! %d\n", err);
		exit(1);
	}

    // Create the output array in device memory for our calculation
    //
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }

    // Write our data set into the input array in device memory
    //
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }

    // Set the arguments to our compute kernel
    //
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }

    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    //
//    global = count;
    global = local;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }

    // Wait for the command commands to get serviced before reading back results
    //
    clFinish(commands);

    // Read back the results from the device to verify the output
    //
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }

    // Validate our results
    //
    correct = 0;
    for(i = 0; i < count; i++)
    {
        if((results[i]) == data[i] * data[i])
            correct++;
    }

    // Print a brief summary detailing the results
    //
    printf("Computed '%d/%d' correct values!\n", correct, count);

    // Shutdown and cleanup
    //
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    return 0;
}
void WorkScheduler::initialize(bool use_opencl, int num_cpu_threads)
{
	/* initialize highlighting */
	if (!g_highlightInitialized) {
		if (g_highlightedNodesRead) MEM_freeN(g_highlightedNodesRead);
		if (g_highlightedNodes)     MEM_freeN(g_highlightedNodes);

		g_highlightedNodesRead = NULL;
		g_highlightedNodes = NULL;

		COM_startReadHighlights();

		g_highlightInitialized = true;
	}

#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
	/* deinitialize if number of threads doesn't match */
	if (g_cpudevices.size() != num_cpu_threads) {
		Device *device;

		while (g_cpudevices.size() > 0) {
			device = g_cpudevices.back();
			g_cpudevices.pop_back();
			device->deinitialize();
			delete device;
		}

		g_cpuInitialized = false;
	}

	/* initialize CPU threads */
	if (!g_cpuInitialized) {
		for (int index = 0; index < num_cpu_threads; index++) {
			CPUDevice *device = new CPUDevice();
			device->initialize();
			g_cpudevices.push_back(device);
		}

		g_cpuInitialized = true;
	}

#ifdef COM_OPENCL_ENABLED
	/* deinitialize OpenCL GPU's */
	if (use_opencl && !g_openclInitialized) {
		g_context = NULL;
		g_program = NULL;

		if (clewInit() != CLEW_SUCCESS) /* this will check for errors and skip if already initialized */
			return;

		if (clCreateContextFromType) {
			cl_uint numberOfPlatforms = 0;
			cl_int error;
			error = clGetPlatformIDs(0, 0, &numberOfPlatforms);
			if (error == -1001) { }   /* GPU not supported */
			else if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
			if (G.f & G_DEBUG) printf("%u number of platforms\n", numberOfPlatforms);
			cl_platform_id *platforms = (cl_platform_id *)MEM_mallocN(sizeof(cl_platform_id) * numberOfPlatforms, __func__);
			error = clGetPlatformIDs(numberOfPlatforms, platforms, 0);
			unsigned int indexPlatform;
			for (indexPlatform = 0; indexPlatform < numberOfPlatforms; indexPlatform++) {
				cl_platform_id platform = platforms[indexPlatform];
				cl_uint numberOfDevices = 0;
				clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 0, &numberOfDevices);
				if (numberOfDevices <= 0)
					continue;

				cl_device_id *cldevices = (cl_device_id *)MEM_mallocN(sizeof(cl_device_id) * numberOfDevices, __func__);
				clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numberOfDevices, cldevices, 0);

				g_context = clCreateContext(NULL, numberOfDevices, cldevices, clContextError, NULL, &error);
				if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
				const char *cl_str[2] = {datatoc_COM_OpenCLKernels_cl, NULL};
				g_program = clCreateProgramWithSource(g_context, 1, cl_str, 0, &error);
				error = clBuildProgram(g_program, numberOfDevices, cldevices, 0, 0, 0);
				if (error != CL_SUCCESS) {
					cl_int error2;
					size_t ret_val_size = 0;
					printf("CLERROR[%d]: %s\n", error, clewErrorString(error));
					error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
					if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
					char *build_log = (char *)MEM_mallocN(sizeof(char) * ret_val_size + 1, __func__);
					error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
					if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
					build_log[ret_val_size] = '\0';
					printf("%s", build_log);
					MEM_freeN(build_log);
				}
				else {
					unsigned int indexDevices;
					for (indexDevices = 0; indexDevices < numberOfDevices; indexDevices++) {
						cl_device_id device = cldevices[indexDevices];
						cl_int vendorID = 0;
						cl_int error2 = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, NULL);
						if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error2, clewErrorString(error2)); }
						OpenCLDevice *clDevice = new OpenCLDevice(g_context, device, g_program, vendorID);
						clDevice->initialize();
						g_gpudevices.push_back(clDevice);
					}
				}
				MEM_freeN(cldevices);
			}
			MEM_freeN(platforms);
		}

		g_openclInitialized = true;
	}
#endif
#endif
}
int main(int argc, char *argv[]){
    cl_uint numPlatforms;
    cl_platform_id* clSelectedPlatformID = NULL;  
	int err;                            // error code returned from api calls
      
    int data[DATA_SIZE];              // original data set given to device
    int results[DATA_SIZE];           // results returned from device
    unsigned int correct;               // number of correct results returned
    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation
 
    cl_device_id device_id;              
    cl_context context;                 
    cl_command_queue commands;          
    cl_program program;                 
    cl_kernel kernel;                   
    
    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array
    
    if(parseArgs(argc, argv)){
        return 0;
    }
    // Fill our data set with random int values
    unsigned int count = DATA_SIZE;
    

	////////////////////////////////////////////////////////////////////////////////
	 
	// Simple compute kernel which computes the collatz of an input array 
	//
	const char *KernelSource = fileToString("gpuFunctions.c");
    
	//get Platform
	clGetPlatformIDs(0, NULL, &numPlatforms);
	clSelectedPlatformID = (cl_platform_id*)malloc(sizeof(cl_platform_id)*numPlatforms);
    err = clGetPlatformIDs(numPlatforms, clSelectedPlatformID, NULL);

    //get Device
    err = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
    	printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }

    //create context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

     // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }
 
    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }

    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
 
        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }
 
    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "allToOne", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }


    // Create the input and output arrays in device memory for our calculation
    //
    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }
    timer t = createTimer();
    for(int i =0;i<rep;i++){
        initData(data);
        // Write our data set into the input array in device memory 
        //
        err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to write to source array!\n");
            exit(1);
        }
     
        // Set the arguments to our compute kernel
        //
        err = 0;
        err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
        err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to set kernel arguments! %d\n", err);
            exit(1);
        }
     
        // Get the maximum work group size for executing the kernel on the device
        //
        err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to retrieve kernel work group info! %d\n", err);
            exit(1);
        }
     
        // Execute the kernel over the entire range of our 1d input data set
        // using the maximum number of work group items for this device
        //
        
        global = count;
        err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
        if (err)
        {
            printf("Error: Failed to execute kernel!\n");
            return EXIT_FAILURE;
        }
     
        // Wait for the command commands to get serviced before reading back results
        //
        clFinish(commands);
     
        // Read back the results from the device to verify the output
        //
        err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );  
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to read output array! %d\n", err);
            exit(1);
        }
    }
    double timeEnd = getTime(t);
    
    // Validate our results
    //
    correct = 0;
    for(int i = 0; i < arraySize; i++)
    {
        if(results[i] >= 0){
            correct++;
            if(i==0){
               printf("%d",results[i]);
            }else{
               printf(",%d",results[i]);
            }
        }
    }
    printf("\n");
    

    // Print a brief summary detailing the results
    printf("Computed '%d/%d' values to 1!\n", correct, arraySize);
    printf("TIME- %f\n",timeEnd);
    
    // Shutdown and cleanup
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
	return 0;
}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program source */
        unsigned char *source_code;
        size_t source_length;

        /* Read program from 'signbit_float16.cl' */
        source_code = read_buffer("signbit_float16.cl", &source_length);

        /* Create a program */
        cl_program program;
        program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret);

        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithSource' failed\n");
                exit(1);
        }
        printf("program=%p\n", program);

        /* Build program */
        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object */
        cl_kernel kernel;
        kernel = clCreateKernel(program, "signbit_float16", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_float16 *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_float16));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_float16){{2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0}};
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float16), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_int16 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_int16));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_int16));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_int16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_int16), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_int16));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
Example #30
0
void opencl_init(void) {

	// get the platform

	cl_uint num_platforms;
	clError = clGetPlatformIDs(0, NULL, &num_platforms);
	checkErr(clError, "clGetPlatformIDs( 0, NULL, &num_platforms );");

	if (num_platforms <= 0) {
		std::cout << "No platform..." << std::endl;
		exit(1);
	}

	cl_platform_id* platforms = new cl_platform_id[num_platforms];
	clError = clGetPlatformIDs(num_platforms, platforms, NULL);
	checkErr(clError, "clGetPlatformIDs( num_platforms, &platforms, NULL );");
	if (num_platforms > 1) {
		char platformName[256];
		clError = clGetPlatformInfo(platforms[0], CL_PLATFORM_VENDOR,
				sizeof(platformName), platformName, NULL);
		std::cerr << "Multiple platforms found defaulting to: " << platformName
				<< std::endl;
	}
	platform_id = platforms[0];
	if (getenv("OPENCL_PLATEFORM"))
		platform_id = platforms[1];
	delete platforms;

	// Connect to a compute device
	//
	cl_uint device_count = 0;
	clError = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, NULL,
			&device_count);
	checkErr(clError, "Failed to create a device group");
	cl_device_id* deviceIds = (cl_device_id*) malloc(
			sizeof(cl_device_id) * device_count);
	clError = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, device_count,
			deviceIds, NULL);
	if (device_count > 1) {
		char device_name[256];
		int compute_units;
		clError = clGetDeviceInfo(deviceIds[0], CL_DEVICE_NAME,
				sizeof(device_name), device_name, NULL);
		checkErr(clError, "clGetDeviceInfo failed");
		clError = clGetDeviceInfo(deviceIds[0], CL_DEVICE_MAX_COMPUTE_UNITS,
				sizeof(cl_uint), &compute_units, NULL);
		checkErr(clError, "clGetDeviceInfo failed");
		std::cerr << "Multiple devices found defaulting to: " << device_name;
		std::cerr << " with " << compute_units << " compute units" << std::endl;
	}
	device_id = deviceIds[0];
	delete deviceIds;
	// Create a compute context 
	//
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &clError);
	checkErr(clError, "Failed to create a compute context!");

	// Create a command commands
	//
	commandQueue = clCreateCommandQueue(context, device_id, 0, &clError);
	checkErr(clError, "Failed to create a command commands!");

	// READ KERNEL FILENAME
	std::string filename = "NOTDEFINED.cl";
	char const* tmp_name = getenv("OPENCL_KERNEL");
	if (tmp_name) {
		filename = std::string(tmp_name);
	} else {
		filename = std::string(__FILE__);
		filename = filename.substr(0, filename.length() - 17);
		filename += "/kernels.cl";

	}

	// READ OPENCL_PARAMETERS
	std::string compile_parameters = "";
	char const* tmp_params = getenv("OPENCL_PARAMETERS");
	if (tmp_params) {
		compile_parameters = std::string(tmp_params);
	}

	std::ifstream kernelFile(filename.c_str(), std::ios::in);

	if (!kernelFile.is_open()) {
		std::cout << "Unable to open " << filename << ". " << __FILE__ << ":"
				<< __LINE__ << "Please set OPENCL_KERNEL" << std::endl;
		exit(1);
	}

	/*
	 * Read the kernel file into an output stream.
	 * Convert this into a char array for passing to OpenCL.
	 */
	std::ostringstream outputStringStream;
	outputStringStream << kernelFile.rdbuf();
	std::string srcStdStr = outputStringStream.str();
	const char* charSource = srcStdStr.c_str();

	kernelFile.close();
	// Create the compute program from the source buffer
	//
	program = clCreateProgramWithSource(context, 1, (const char **) &charSource,
			NULL, &clError);
	if (!program) {
		printf("Error: Failed to create compute program!\n");
		exit(1);
	}

	// Build the program executable
	//
	clError = clBuildProgram(program, 0, NULL, compile_parameters.c_str(), NULL,
			NULL);

	/* Get the size of the build log. */
	size_t logSize = 0;
	clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL,
			&logSize);

	if (clError != CL_SUCCESS) {
		if (logSize > 1) {
			char* log = new char[logSize];
			clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
					logSize, log, NULL);

			std::string stringChars(log, logSize);
			std::cerr << "Build log:\n " << stringChars << std::endl;

			delete[] log;
		}
		printf("Error: Failed to build program executable!\n");
		exit(1);
	}

	return;

}