Example no. 1
int
main(int argc, char **argv)
{
    cl_uint num;
    cl_int err;
    int platform_idx = -1;
    cl_platform_id *plat_ids;
    int i;
    size_t sz;
    cl_device_id *gpu_devs;
    cl_context_properties cps[3];
    cl_context context;
    int opt;
    char *input;
    int run_size = 1024;
    struct AIISA_Program prog;
    cl_command_queue queue;
    int ei;
    int nloop = 16;
    struct AIISA_CodeBuffer buf;

    aiisa_code_buffer_init(&buf);

    clGetPlatformIDs(0, NULL, &num);

    plat_ids = (cl_platform_id*)malloc(sizeof(*plat_ids) * num);
    clGetPlatformIDs(num, plat_ids, NULL);

    while ((opt = getopt(argc, argv, "n:")) != -1) {
        switch (opt) {
        case 'n':
            run_size = atoi(optarg);
            break;

        default:
            puts("usage : run in.cl");
            return 1;
        }
    }

    if (optind >= argc) {
        puts("usage : run in.cl");
        return 1;
    }

    input = argv[optind];

    for (i=0; i<(int)num; i++) {
        char name[1024];
        size_t len;
        clGetPlatformInfo(plat_ids[i], CL_PLATFORM_VENDOR, sizeof(name), name, &len);

        //puts(name);
        if (strcmp(name, "Advanced Micro Devices, Inc.") == 0) {
            platform_idx = i;
            break;
        }
    }

    if (platform_idx == -1) {
        puts("no amd");
        return -1;
    }

    clGetDeviceIDs(plat_ids[platform_idx], CL_DEVICE_TYPE_GPU, 0, NULL, &num);
    if (num == 0) {
        puts("no gpu");
        return -1;
    }

    gpu_devs = (cl_device_id*)malloc(sizeof(gpu_devs[0]) * 1);
    //clGetDeviceIDs(plat_ids[platform_idx], CL_DEVICE_TYPE_GPU, num, gpu_devs, NULL);

    cps[0] = CL_CONTEXT_PLATFORM;
    cps[1] = (cl_context_properties)plat_ids[platform_idx];
    cps[2] = 0;

    context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
    clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(gpu_devs[0]), gpu_devs, &sz); /* only one device slot was allocated above */

    queue = clCreateCommandQueue(context, gpu_devs[0], 0, NULL);

    {
        char name[1024];
        size_t sz;
        clGetDeviceInfo(gpu_devs[0], CL_DEVICE_NAME, sizeof(name), name, &sz);

        puts(name);
    }

    //puts(input);

    aiisa_build_binary_from_cl(&prog, context, gpu_devs[0], input);

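    /* each iteration regenerates the kernel code, rebuilds a cl_program from the produced binary, and times the build */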
    for (ei=0; ei<nloop; ei++) {
        cl_program cl_prog;
        const unsigned char *bin[1];
        size_t bin_size[1];
        cl_kernel ker;
        cl_mem in, out;
        size_t global_size[3];
        double tb, te;

        tb = sec();
        gen_code(&prog, &buf);
        bin[0] = prog.cl_binary;
        bin_size[0] = prog.size;
        cl_prog = clCreateProgramWithBinary(context, 1, gpu_devs, bin_size, bin, NULL, NULL);
        clBuildProgram(cl_prog, 1, gpu_devs, NULL, NULL, NULL);
        ker = clCreateKernel(cl_prog, "f", &err);
        te = sec();
        printf("build : %f[usec]\n", (te-tb)*1000000);

        in = clCreateBuffer(context, CL_MEM_READ_WRITE, run_size * sizeof(int), NULL, &err);
        out = clCreateBuffer(context, CL_MEM_READ_WRITE, run_size * sizeof(int), NULL, &err);

        clSetKernelArg(ker, 0, sizeof(cl_mem), &in);
        clSetKernelArg(ker, 1, sizeof(cl_mem), &out);


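        /* fill the input buffer via a blocking map, then unmap it before the kernel uses it */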
        {
            int *ptr = (int*)clEnqueueMapBuffer(queue, in, CL_TRUE, CL_MAP_WRITE, 0, run_size*sizeof(int), 0, NULL, NULL, NULL);
            int i;
            for (i=0; i<run_size; i++) {
                ptr[i] = i;
            }
            clEnqueueUnmapMemObject(queue, in, ptr, 0, NULL, NULL);
        }

        {
            int *ptr = (int*)clEnqueueMapBuffer(queue, out, CL_TRUE, CL_MAP_WRITE, 0, run_size*sizeof(int), 0, NULL, NULL, NULL);
            int i;
            for (i=0; i<run_size; i++) {
                ptr[i] = 0xdeadbeef;
            }
            clEnqueueUnmapMemObject(queue, out, ptr, 0, NULL, NULL);
        }

        err = clFinish(queue);

        global_size[0] = run_size;
        err = clEnqueueNDRangeKernel(queue, ker, 1, NULL, global_size, NULL, 0, NULL, NULL);
        if (err != CL_SUCCESS) {
            puts("enqueue nd");
        }
        err = clFinish(queue);
        if (err != CL_SUCCESS) {
            puts("fini");
        }

        if (ei == 0) {
            int *ptr = (int*)clEnqueueMapBuffer(queue, out, CL_TRUE, CL_MAP_READ, 0, run_size*sizeof(int), 0, NULL, NULL, NULL);
            int i;
            for (i=0; i<run_size; i++) {
                printf("%d : %x\n", i, ptr[i]);
            }
            clEnqueueUnmapMemObject(queue, out, ptr, 0, NULL, NULL);
        }

        err = clFinish(queue);

        clReleaseMemObject(in);
        clReleaseMemObject(out);
        clReleaseKernel(ker);
        clReleaseProgram(cl_prog);
    }

    return 0;
}
Example no. 2
void cape::load(int team)
{
    saved_team = team;

    hit_floor = false;

    death_height_offset = 0.f;

    model = cpu_context->make_new();

    model->set_load_func(std::bind(cape::load_cape, std::placeholders::_1, team));
    model->set_active(true);
    model->cache = false; ///why?
    //model->set_normal("res/norm_body.png");

    //obj_mem_manager::load_active_objects();

    cpu_context->load_active();

    model->set_two_sided(true);
    model->set_specular(0.7);

    //obj_mem_manager::g_arrange_mem();
    //obj_mem_manager::g_changeover();

    cpu_context->build();
    gpu_context = cpu_context->fetch();

    which = 0;

    in = compute::buffer(cl::context, sizeof(float)*width*height*depth*3, CL_MEM_READ_WRITE, nullptr);
    out = compute::buffer(cl::context, sizeof(float)*width*height*depth*3, CL_MEM_READ_WRITE, nullptr);

    cl_float* inmap = (cl_float*) clEnqueueMapBuffer(cl::cqueue.get(), in.get(), CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_float)*width*height*depth*3, 0, NULL, NULL, NULL);
    cl_float* outmap = (cl_float*) clEnqueueMapBuffer(cl::cqueue.get(), out.get(), CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_float)*width*height*depth*3, 0, NULL, NULL, NULL);

    const float separation = 10.f;

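    // fill both mapped position buffers with a regular width x height grid in the xy-plane, spaced by `separation`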
    for(int j=0; j<height; j++)
    {
        for(int i=0; i<width; i++)
        {
            float xpos = i * separation;
            float ypos = j * separation;
            float zpos = 0;

            inmap[(i + j*width)*3 + 0] = xpos;
            inmap[(i + j*width)*3 + 1] = ypos;
            inmap[(i + j*width)*3 + 2] = zpos;

            outmap[(i + j*width)*3 + 0] = xpos;
            outmap[(i + j*width)*3 + 1] = ypos;
            outmap[(i + j*width)*3 + 2] = zpos;
        }
    }

    clEnqueueUnmapMemObject(cl::cqueue.get(), in.get(), inmap, 0, NULL, NULL);
    clEnqueueUnmapMemObject(cl::cqueue.get(), out.get(), outmap, 0, NULL, NULL);

    loaded = true;

    context_id = cpu_context->get_context_id();
}
Example no. 3
int main(int argc, char *argv[])
{
    int myid, numprocs, i, j;
    int size, align_size;

// host buffer
    char *s_buf, *r_buf, *s_buf1, *r_buf1;
    double t_start = 0.0, t_end = 0.0, t = 0.0;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    align_size = getpagesize();
    assert(align_size <= MAX_ALIGNMENT);

#ifdef PINNED
   // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    err_status(ret);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1,
            &device_id, &ret_num_devices);
    err_status(ret);

    printf("%d device(s) in %d platform(s)\n",ret_num_devices, ret_num_platforms);
    char cBuffer[1024];
    ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer,
NULL);
    err_status(ret);
    printf("CL_DEVICE_NAME:       %s\n", cBuffer);

    // Create an OpenCL context
    cl_context context = clCreateContext (NULL,
                                          1, 
                                          &device_id, 
                                          NULL, 
                                          NULL,
                                          &ret);
    err_status(ret);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue (context, 
                                                           device_id, 
                                                           0,
                                                           &ret);
    err_status(ret);

    // Create memory buffers on the device
    cl_mem s_mem = clCreateBuffer(context, 
                                  CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                  // CL_MEM_COPY_HOST_PTR is only valid with non-NULL pointer
                                  MYBUFSIZE, 
                                  NULL, 
                                  &ret);
    err_status(ret);
    cl_mem r_mem = clCreateBuffer(context,
                                  CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, 
                                  // CL_MEM_COPY_HOST_PTR is only valid with non-NULL pointer
                                  MYBUFSIZE, 
                                  NULL, 
                                  &ret);
    err_status(ret);

   // pinned memory (blocked call)
   s_buf1 = (char *) clEnqueueMapBuffer(command_queue,
                                        s_mem, 
                                        CL_TRUE,
                                        CL_MAP_WRITE,
                                        0, 
                                        MYBUFSIZE, 
                                        0,
                                        NULL,
                                        NULL,
                                        &ret);
   err_status(ret);
   r_buf1 = (char *) clEnqueueMapBuffer(command_queue,
                                        r_mem, 
                                        CL_TRUE,
                                        CL_MAP_WRITE, 
                                        0, 
                                        MYBUFSIZE, 
                                        0,
                                        NULL, 
                                        NULL, 
                                        &ret);
   err_status(ret);

#else
    if (myid == 0) printf("# Using PAGEABLE host memory!\n");
    s_buf1 = (char*) malloc(MYBUFSIZE);
    r_buf1 = (char*) malloc(MYBUFSIZE);
#endif

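    /* round both host pointers up to the next page boundary so the benchmark uses page-aligned buffers */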
    s_buf =
        (char *) (((unsigned long) s_buf1 + (align_size - 1)) /
                  align_size * align_size);
    r_buf =
        (char *) (((unsigned long) r_buf1 + (align_size - 1)) /
                  align_size * align_size);

    if(numprocs != 2) {
        if(myid == 0) {
            fprintf(stderr, "This test requires exactly two processes\n");
        }

        MPI_Finalize();

        return EXIT_FAILURE;
    }

    if(myid == 0) {
        fprintf(stdout, "# %s\n", BENCHMARK);
        fprintf(stdout, "%-*s%*s\n", 10, "# Size", FIELD_WIDTH,
                "Bandwidth (MB/s)");
        fflush(stdout);
    }

    /* Bandwidth test */
    for(size = 1; size <= MAX_MSG_SIZE; size *= 2) {
        /* touch the data */
        for(i = 0; i < size; i++) {
            s_buf[i] = 'a';
            r_buf[i] = 'b';
        }
        //   puts("2");
        if(size > large_message_size) {
            loop = loop_large;
            skip = skip_large;
            window_size = window_size_large;
        }

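        /* rank 0 posts window_size non-blocking sends per iteration and waits for a short ack;
           rank 1 posts the matching non-blocking receives and returns the ack */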
        if(myid == 0) {
            for(i = 0; i < loop + skip; i++) {
                if(i == skip) {
                    t_start = MPI_Wtime();
                }

                for(j = 0; j < window_size; j++) {
                    MPI_Isend(s_buf, size, MPI_CHAR, 1, 100, MPI_COMM_WORLD,
                            request + j);
                }

                MPI_Waitall(window_size, request, reqstat);
                MPI_Recv(r_buf, 4, MPI_CHAR, 1, 101, MPI_COMM_WORLD,
                        &reqstat[0]);
            }

            t_end = MPI_Wtime();
            // printf("%d %d\n",myid,size);
            t = t_end - t_start;
        }

        else if(myid == 1) {
            for(i = 0; i < loop + skip; i++) {
                for(j = 0; j < window_size; j++) {
                    MPI_Irecv(r_buf, size, MPI_CHAR, 0, 100, MPI_COMM_WORLD,
                            request + j);
                }

                MPI_Waitall(window_size, request, reqstat);
                MPI_Send(s_buf, 4, MPI_CHAR, 0, 101, MPI_COMM_WORLD);
            }
            // printf("%d %d\n",myid,size);
        }

        if(myid == 0) {
            double tmp = size / 1e6 * loop * window_size;

            fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH,
                    FLOAT_PRECISION, tmp / t);
            fflush(stdout);
        }
    }

#ifdef PINNED
//    cudaFree(s_buf1);
//    cudaFree(r_buf1);
//   clReleaseMemObject(s_mem);
//   clReleaseMemObject(r_mem);

#else
    free(s_buf1);
    free(r_buf1);
#endif

    MPI_Finalize();

    return EXIT_SUCCESS;
}
Example no. 4
extern "C" magma_err_t
magma_dpotrf_msub(int num_subs, int num_gpus, magma_uplo_t uplo, magma_int_t n, 
                  magmaDouble_ptr *d_lA, size_t dA_offset, 
                  magma_int_t ldda, magma_int_t *info, 
                  magma_queue_t *queues)
{
/*  -- clMAGMA (version 1.1.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2014

    Purpose   
    =======   
    DPOTRF computes the Cholesky factorization of a real symmetric   
    positive definite matrix dA.   

    The factorization has the form   
       dA = U**T * U,  if UPLO = 'U', or   
       dA = L  * L**T,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of dA is stored;   
            = 'L':  Lower triangle of dA is stored.   

    N       (input) INTEGER   
            The order of the matrix dA.  N >= 0.   

    dA      (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)   
            On entry, the symmetric matrix dA.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of dA contains the upper   
            triangular part of the matrix dA, and the strictly lower   
            triangular part of dA is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of dA contains the lower   
            triangular part of the matrix dA, and the strictly upper   
            triangular part of dA is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization dA = U**T * U or dA = L * L**T.   

    LDDA     (input) INTEGER   
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalesced memory accesses, LDDA must be
            divisible by 16.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    =====================================================================   */


    int tot_subs = num_subs * num_gpus;
    magma_err_t err;
    magma_int_t j, nb, d, lddp, h;
    double *work;
    magmaDouble_ptr dwork[MagmaMaxGPUs];

    *info = 0;
    nb = magma_get_dpotrf_nb(n);
    if ( uplo != MagmaUpper && uplo != MagmaLower ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (uplo != MagmaUpper) {
        lddp = nb*(n/(nb*tot_subs));
        if( n%(nb*tot_subs) != 0 ) lddp+=min(nb,n-tot_subs*lddp);
        if( ldda < lddp ) *info = -4;
    } else if( ldda < n ) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (num_gpus == 1 && ((nb <= 1) || (nb >= n)) ) {
        /*  Use unblocked code. */
        err = magma_dmalloc_cpu( &work, n*nb );
        if (err != MAGMA_SUCCESS) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_dgetmatrix( n, n, d_lA[0], 0, ldda, work, 0, n, queues[0] );
        lapackf77_dpotrf(lapack_uplo_const(uplo), &n, work, &n, info);
        magma_dsetmatrix( n, n, work, 0, n, d_lA[0], 0, ldda, queues[0] );
        magma_free_cpu( work );
    } else {
        lddp = 32*((n+31)/32);
        for (d=0; d<num_gpus; d++) {
            if (MAGMA_SUCCESS != magma_dmalloc( &dwork[d], num_gpus*nb*lddp )) {
                for( j=0; j<d; j++ ) magma_free( dwork[j] );
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
        }
        h = 1; //num_gpus; //(n+nb-1)/nb;
        #ifdef USE_PINNED_CLMEMORY
        cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(double)*n*nb*h, NULL, NULL);
        for (d=0; d<num_gpus; d++) {
            work = (double*)clEnqueueMapBuffer(queues[2*d], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 
                                                           sizeof(double)*n*nb*h, 0, NULL, NULL, NULL);
        }
        #else
        if (MAGMA_SUCCESS != magma_dmalloc_cpu( &work, n*nb*h )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        #endif
        if (uplo == MagmaUpper) {
            /* with two queues for each device */
            magma_dpotrf2_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, 
                               dwork, lddp, work, n, h, info, queues);
            //magma_dpotrf3_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, 
            //                   dwork, lddp, work, n, h, info, queues);
            /* with three streams */
            //magma_dpotrf3_msub(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n,  
            //                   h, stream, event, info);
        } else {
            /* with two queues for each device */
            magma_dpotrf2_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, 
                               dwork, lddp, work, nb*h, h, info, queues);
            //magma_dpotrf3_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, 
            //                   dwork, lddp, work, nb*h, h, info, queues);
            //magma_dpotrf4_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, 
            //                   dwork, lddp, work, nb*h, h, info, queues);
            /* with three streams */
            //magma_dpotrf3_msub(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, 
            //                   h, stream, event, info);
        }

        /* clean up */
        for (d=0; d<num_gpus; d++) magma_free( dwork[d] );
        #ifdef USE_PINNED_CLMEMORY
        for (d=0; d<num_gpus; d++) {
            clEnqueueUnmapMemObject(queues[2*d], buffer, work, 0, NULL, NULL);
        }
        clReleaseMemObject( buffer );
        #else
        magma_free_cpu( work );
        #endif
    } /* end of not lapack */

    return *info;
} /* magma_dpotrf_msub */
Example no. 5
extern "C" void mixbenchGPU(cl_device_id dev_id, double *c, long size, bool block_strided, bool host_allocated, size_t workgroupsize, unsigned int elements_per_wi, unsigned int fusion_degree) {
    const char *benchtype;
    if(block_strided)
        benchtype = "Workgroup";
    else
        benchtype = "NDRange";
    printf("Workitem stride:        %s\n", benchtype);
    const char *buffer_allocation = host_allocated ? "Host allocated" : "Device allocated";
    printf("Buffer allocation:      %s\n", buffer_allocation);

    // Set context properties
    cl_platform_id p_id;
    OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_PLATFORM, sizeof(p_id), &p_id, NULL) );
    size_t length;
    OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, 0, NULL, &length) );
    char *extensions = (char*)alloca(length);
    OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, length, extensions, NULL) );
    bool enable_dp = strstr(extensions, "cl_khr_fp64") != NULL;

    cl_context_properties ctxProps[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)p_id, 0 };

    cl_int errno;
    // Create context
    cl_context context = clCreateContext(ctxProps, 1, &dev_id, NULL, NULL, &errno);
    OCL_SAFE_CALL(errno);

    cl_mem_flags buf_flags = CL_MEM_READ_WRITE;
    if( host_allocated )
        buf_flags |= CL_MEM_ALLOC_HOST_PTR;
    cl_mem c_buffer = clCreateBuffer(context, buf_flags, size*sizeof(double), NULL, &errno);
    OCL_SAFE_CALL(errno);

    // Create command queue
    cl_command_queue cmd_queue = clCreateCommandQueue(context, dev_id, CL_QUEUE_PROFILING_ENABLE, &errno);
    OCL_SAFE_CALL(errno);

    // Set data on device memory
    cl_double *mapped_data = (cl_double*)clEnqueueMapBuffer(cmd_queue, c_buffer, CL_TRUE, CL_MAP_WRITE, 0, size*sizeof(double), 0, NULL, NULL, &errno);
    OCL_SAFE_CALL(errno);
    for(int i=0; i<size; i++)
        mapped_data[i] = 0.0;   // zero the whole double buffer, not just the first half of its bytes
    clEnqueueUnmapMemObject(cmd_queue, c_buffer, mapped_data, 0, NULL, NULL);

    // Load source, create program and all kernels
    printf("Loading kernel source file...\n");
    const char c_param_format_str[] = "-cl-std=CL1.1 -cl-mad-enable -Dclass_T=%s -Dblockdim=" SIZE_T_FORMAT " -DCOMPUTE_ITERATIONS=%d -DELEMENTS_PER_THREAD=%d -DFUSION_DEGREE=%d %s %s";
    const char *c_empty = "";
    const char *c_striding = block_strided ? "-DBLOCK_STRIDED" : c_empty;
    const char *c_enable_dp = "-DENABLE_DP";
    char c_build_params[256];
    const char *c_kernel_source = {ReadFile("mix_kernels_ro.cl")};
    printf("Precompilation of kernels... ");
    sprintf(c_build_params, c_param_format_str, "short", workgroupsize, 0, 1, 1, c_striding, c_empty);

    cl_kernel kernel_warmup = BuildKernel(context, dev_id, c_kernel_source, c_build_params);

    show_progress_init(compute_iterations_len);
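    // one kernel variant per (data type, compute-iteration count) pair; double-precision kernels are built only when cl_khr_fp64 is available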
    cl_kernel kernels[kdt_double+1][compute_iterations_len];
    for(int i=0; i<compute_iterations_len; i++) {
        show_progress_step(0, '\\');
        sprintf(c_build_params, c_param_format_str, "float", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_empty);
        //printf("%s\n",c_build_params);
        kernels[kdt_float][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params);

        show_progress_step(0, '|');
        sprintf(c_build_params, c_param_format_str, "int", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_empty);
        //printf("%s\n",c_build_params);
        kernels[kdt_int][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params);

        if( enable_dp ) {
            show_progress_step(0, '/');
            sprintf(c_build_params, c_param_format_str, "double", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_enable_dp);
            //printf("%s\n",c_build_params);
            kernels[kdt_double][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params);
        } else
            kernels[kdt_double][i] = 0;
        show_progress_step(1, '>');
    }
    show_progress_done();
    free((char*)c_kernel_source);

    runbench_warmup(cmd_queue, kernel_warmup, c_buffer, size, workgroupsize);

    // Synchronize in order to wait for memory operations to finish
    OCL_SAFE_CALL( clFinish(cmd_queue) );

    printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n");
    printf("Experiment ID, Single Precision ops,,,,              Double precision ops,,,,              Integer operations,,, \n");
    printf("Compute iters, Flops/byte, ex.time,  GFLOPS, GB/sec, Flops/byte, ex.time,  GFLOPS, GB/sec, Iops/byte, ex.time,   GIOPS, GB/sec\n");

    for(int i=0; i<compute_iterations_len; i++)
        runbench(compute_iterations, i, cmd_queue, kernels, c_buffer, size, workgroupsize, elements_per_wi, fusion_degree);

    printf("------------------------------------------------------------------------------------------------------------------------------\n");

    // Copy results back to host memory
    OCL_SAFE_CALL( clEnqueueReadBuffer(cmd_queue, c_buffer, CL_TRUE, 0, size*sizeof(double), c, 0, NULL, NULL) );

    // Release kernels and program
    ReleaseKernelNProgram(kernel_warmup);
    for(int i=0; i<compute_iterations_len; i++) {
        ReleaseKernelNProgram(kernels[kdt_float][i]);
        ReleaseKernelNProgram(kernels[kdt_int][i]);
        if( enable_dp )
            ReleaseKernelNProgram(kernels[kdt_double][i]);
    }

    // Release buffer
    OCL_SAFE_CALL( clReleaseMemObject(c_buffer) );
}
Example no. 6
int 
BinomialOption::runCLKernels()
{
    cl_int status;
    cl_event ndrEvt;
    cl_int eventStatus = CL_QUEUED;

    cl_event inMapEvt;
    void* mapPtr = clEnqueueMapBuffer(commandQueue, 
                                      randBuffer, 
                                      CL_FALSE, 
                                      CL_MAP_WRITE, 
                                      0,
                                      numSamples * sizeof(cl_float4),
                                      0,
                                      NULL,
                                      &inMapEvt,
                                      &status);
    CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer failed. (inputBuffer)");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&inMapEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(inMapEvt) Failed");

    memcpy(mapPtr, randArray, numSamples * sizeof(cl_float4));

    cl_event inUnmapEvent;

    status = clEnqueueUnmapMemObject(commandQueue,
                                    randBuffer,
                                    mapPtr,
                                    0,
                                    NULL,
                                    &inUnmapEvent);
    CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject failed. (randBuffer)");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&inUnmapEvent);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(inUnmapEvent) Failed");

    // Set appropriate arguments to the kernel 
    status = clSetKernelArg(kernel,
                            0, 
                            sizeof(int), 
                            (void*)&numSteps);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(numSteps) failed.");

    status = clSetKernelArg(kernel, 
                            1, 
                            sizeof(cl_mem), 
                            (void*)&randBuffer);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(randBuffer) failed.");

    status = clSetKernelArg(kernel, 
                            2, 
                            sizeof(cl_mem), 
                            (void*)&outBuffer);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(outBuffer) failed.");

    status = clSetKernelArg(kernel,
                            3,
                            (numSteps + 1) * sizeof(cl_float4),
                            NULL);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(callA) failed.");

    status = clSetKernelArg(kernel,
                            4,
                            numSteps * sizeof(cl_float4),
                            NULL);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(callB) failed.");

    // Enqueue a kernel run call.
    size_t globalThreads[] = {numSamples * (numSteps + 1)};
    size_t localThreads[] = {numSteps + 1};
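    // numSamples work-groups, each of (numSteps + 1) work-items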

    if(localThreads[0] > deviceInfo.maxWorkItemSizes[0] || localThreads[0] > deviceInfo.maxWorkGroupSize)
    {
        std::cout << "Unsupported: Device does not support"
            "requested number of work items.";
        return SDK_FAILURE;
    }
    
    if(kernelInfo.localMemoryUsed > deviceInfo.localMemSize)
    {
        std::cout << "Unsupported: Insufficient local memory on device." << std::endl;
        return SDK_FAILURE;
    }

    /**
     * This algorithm reduces each group of work-items to a single value
     * on OpenCL device
     */
    
    status = clEnqueueNDRangeKernel(
                    commandQueue,
                    kernel,
                    1,
                    NULL,
                    globalThreads,
                    localThreads,
                    0,
                    NULL,
                    &ndrEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel() failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush() failed.");

    status = sampleCommon->waitForEventAndRelease(&ndrEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt) Failed");
   
    cl_event outMapEvt;
    cl_uint* outMapPtr = (cl_uint*)clEnqueueMapBuffer(commandQueue,
                                                   outBuffer,
                                                   CL_FALSE,
                                                   CL_MAP_READ,
                                                   0,
                                                   numSamples * sizeof(cl_float4),
                                                   0,
                                                   NULL,
                                                   &outMapEvt,
                                                   &status);
    CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(outputBuffer) failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&outMapEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outMapEvt) Failed");
    memcpy(output, outMapPtr, numSamples * sizeof(cl_float4));

    cl_event outUnmapEvt;
    status = clEnqueueUnmapMemObject(commandQueue,
                                     outBuffer,
                                     (void*)outMapPtr,
                                     0,
                                     NULL,
                                     &outUnmapEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(outputBuffer) failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&outUnmapEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outUnmapEvt) Failed");
    return SDK_SUCCESS;
}
Example no. 7
int main(int argc, char *argv[])
{
	// selected platform and device number
	cl_uint pn = 0, dn = 0;

	// OpenCL error
	cl_int error;

	// generic iterator
	cl_uint i;

	// major/minor version of the platform OpenCL version
	cl_uint ocl_major, ocl_minor;

	// set platform/device num from command line
	if (argc > 1)
		pn = atoi(argv[1]);
	if (argc > 2)
		dn = atoi(argv[2]);

	error = clGetPlatformIDs(0, NULL, &np);
	CHECK_ERROR("getting amount of platform IDs");
	printf("%u platforms found\n", np);
	if (pn >= np) {
		fprintf(stderr, "there is no platform #%u\n" , pn);
		exit(1);
	}
	// only allocate for IDs up to the intended one
	platform = calloc(pn+1,sizeof(*platform));
	// if allocation failed, next call will bomb. rely on this
	error = clGetPlatformIDs(pn+1, platform, NULL);
	CHECK_ERROR("getting platform IDs");

	// choose platform
	p = platform[pn];

	error = clGetPlatformInfo(p, CL_PLATFORM_NAME, BUFSZ, strbuf, NULL);
	CHECK_ERROR("getting platform name");
	printf("using platform %u: %s\n", pn, strbuf);

	error = clGetPlatformInfo(p, CL_PLATFORM_VERSION, BUFSZ, strbuf, NULL);
	CHECK_ERROR("getting platform version");
	// we need 1.2 at least
	i = sscanf(strbuf, "OpenCL %u.%u ", &ocl_major, &ocl_minor);
	if (i != 2) {
		fprintf(stderr, "%s:%u: unable to determine platform OpenCL version\n",
			__func__, __LINE__);
		exit(1);
	}
	if (ocl_major == 1 && ocl_minor < 2) {
		fprintf(stderr, "%s:%u: Platform version %s is not at least 1.2\n",
			__func__, __LINE__, strbuf);
		exit(1);
	}

	error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &nd);
	CHECK_ERROR("getting amount of device IDs");
	printf("%u devices found\n", nd);
	if (dn >= nd) {
		fprintf(stderr, "there is no device #%u\n", dn);
		exit(1);
	}
	// only allocate for IDs up to the intended one
	device = calloc(dn+1,sizeof(*device));
	// if allocation failed, next call will bomb. rely on this
	error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, dn+1, device, NULL);
	CHECK_ERROR("getting device IDs");

	// choose device
	d = device[dn];
	error = clGetDeviceInfo(d, CL_DEVICE_NAME, BUFSZ, strbuf, NULL);
	CHECK_ERROR("getting device name");
	printf("using device %u: %s\n", dn, strbuf);

	error = clGetDeviceInfo(d, CL_DEVICE_GLOBAL_MEM_SIZE,
			sizeof(gmem), &gmem, NULL);
	CHECK_ERROR("getting device global memory size");
	error = clGetDeviceInfo(d, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
			sizeof(alloc_max), &alloc_max, NULL);
	CHECK_ERROR("getting device max memory allocation size");

	// create context
	ctx_prop[1] = (cl_context_properties)p;
	ctx = clCreateContext(ctx_prop, 1, &d, NULL, NULL, &error);
	CHECK_ERROR("creating context");

	// create queue
	q = clCreateCommandQueue(ctx, d, CL_QUEUE_PROFILING_ENABLE, &error);
	CHECK_ERROR("creating queue");

	// create program
	pg = clCreateProgramWithSource(ctx, sizeof(src)/sizeof(*src), src, NULL, &error);
	CHECK_ERROR("creating program");

	// build program
	error = clBuildProgram(pg, 1, &d, NULL, NULL, NULL);
	CHECK_ERROR("building program");

	// get kernel
	k = clCreateKernel(pg, "add", &error);
	CHECK_ERROR("creating kernel");

	error = clGetKernelWorkGroupInfo(k, d, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
			sizeof(wgm), &wgm, NULL);
	CHECK_ERROR("getting preferred workgroup size multiple");

	// number of elements on which kernel will be launched. it's ok if we don't
	// cover every byte of the buffers
	nels = alloc_max/sizeof(cl_float);

	gws = ROUND_MUL(nels, wgm);

	printf("will use %zu workitems grouped by %zu to process %u elements\n",
			gws, wgm, nels);

	// we will try and allocate at least one buffer more than needed to fill
	// the device memory, and no less than 3 anyway
	nbuf = gmem/alloc_max + 1;
	if (nbuf < 3)
		nbuf = 3;

#define MB (1024*1024.0)

	printf("will try allocating %u host buffers of %gMB each to overcommit %gMB\n",
			nbuf, alloc_max/MB, gmem/MB);

	hostbuf = calloc(nbuf, sizeof(cl_mem));

	if (!hostbuf) {
		fprintf(stderr, "could not prepare support for %u buffers\n", nbuf);
		exit(1);
	}

	// allocate ‘host’ buffers
	for (i = 0; i < nbuf; ++i) {
		hostbuf[i] = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, alloc_max,
				NULL, &error);
		CHECK_ERROR("allocating host buffer");
		printf("host buffer %u allocated\n", i);
		error = clEnqueueMigrateMemObjects(q, 1, hostbuf + i,
				CL_MIGRATE_MEM_OBJECT_HOST | CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED,
				0, NULL, NULL);
		CHECK_ERROR("migrating buffer to host");
		printf("buffer %u migrated to host\n", i);
	}

	// allocate ‘device’ buffers
	for (i = 0; i < 2; ++i) {
		devbuf[i] = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, alloc_max,
				NULL, &error);
		CHECK_ERROR("allocating devbuffer");
		printf("dev buffer %u allocated\n", i);
		if (i == 0) {
			float patt = 0;
			error = clEnqueueFillBuffer(q, devbuf[0], &patt, sizeof(patt),
					0, nels*sizeof(patt), 0, NULL, &mem_evt);
			CHECK_ERROR("enqueueing memset");
		}
	}
	error = clWaitForEvents(1, &mem_evt);
	CHECK_ERROR("waiting for buffer fill");
	clReleaseEvent(mem_evt); mem_evt = NULL;

	// use the buffers
	for (i = 0; i < nbuf; ++i) {
		printf("testing buffer %u\n", i);

		// for each buffer, we do a setup on CPU and then use it as second
		// argument for the kernel
		hbuf = clEnqueueMapBuffer(q, hostbuf[i], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
				0, alloc_max, 0, NULL, NULL, &error);
		CHECK_ERROR("mapping buffer");
		for (e = 0; e < nels; ++e)
			hbuf[e] = i;
		error = clEnqueueUnmapMemObject(q, hostbuf[i], hbuf, 0, NULL, NULL);
		CHECK_ERROR("unmapping buffer");
		hbuf = NULL;

		// copy ‘host’ to ‘device’ buffer
		clEnqueueCopyBuffer(q, hostbuf[i], devbuf[1], 0, 0, alloc_max,
				0, NULL, NULL);
		// make sure all pending actions are completed
		error =	clFinish(q);
		CHECK_ERROR("settling down");

		clSetKernelArg(k, 0, sizeof(cl_mem), devbuf);
		clSetKernelArg(k, 1, sizeof(cl_mem), devbuf + 1);
		clSetKernelArg(k, 2, sizeof(nels), &nels);
		error = clEnqueueNDRangeKernel(q, k, 1, NULL, &gws, &wgm,
				0, NULL, &krn_evt);
		CHECK_ERROR("enqueueing kernel");

		error = clEnqueueCopyBuffer(q, devbuf[0], hostbuf[0],
				0, 0, alloc_max, 1, &krn_evt, &mem_evt);
		CHECK_ERROR("copying data to host");

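		// the kernel presumably accumulates devbuf[1] (filled with i) into devbuf[0],
		// so after iteration i every element should equal the triangular number i*(i+1)/2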
		expected = i*(i+1)/2.0f;
		hbuf = clEnqueueMapBuffer(q, hostbuf[0], CL_TRUE, CL_MAP_READ,
				0, alloc_max, 1, &mem_evt, NULL, &error);
		CHECK_ERROR("mapping buffer 0");
		for (e = 0; e < nels; ++e)
			if (hbuf[e] != expected) {
				fprintf(stderr, "mismatch @ %u: %g instead of %g\n",
						e, hbuf[e], expected);
				exit(1);
			}
		error = clEnqueueUnmapMemObject(q, hostbuf[0], hbuf, 0, NULL, NULL);
		CHECK_ERROR("unmapping buffer 0");
		hbuf = NULL;
		clReleaseEvent(krn_evt);
		clReleaseEvent(mem_evt);
		krn_evt = mem_evt = NULL;
	}

	for (i = 1; i <= 2; ++i) {
		clReleaseMemObject(devbuf[2 - i]);
		printf("dev buffer %u freed\n", nbuf  - i);
	}
	for (i = 1; i <= nbuf; ++i) {
		clReleaseMemObject(hostbuf[nbuf - i]);
		printf("host buffer %u freed\n", nbuf  - i);
	}

	return 0;
}
Example no. 8
/* ------- Create and destroy necessary objects ------- */
static void create_clobj(int gws, struct fmt_main * self) {
    self->params.min_keys_per_crypt = self->params.max_keys_per_crypt = gws;

    pinned_saved_keys = clCreateBuffer(context[ocl_gpu_id],
            CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
            sizeof(sha512_password) * gws, NULL, &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating page-locked memory pinned_saved_keys");

    plaintext = (sha512_password *) clEnqueueMapBuffer(queue[ocl_gpu_id],
            pinned_saved_keys, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0,
            sizeof(sha512_password) * gws, 0, NULL, NULL, &ret_code);
    HANDLE_CLERROR(ret_code, "Error mapping page-locked memory saved_plain");

    pinned_partial_hashes = clCreateBuffer(context[ocl_gpu_id],
            CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
            sizeof(sha512_hash) * gws, NULL, &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating page-locked memory pinned_partial_hashes");

    calculated_hash = (sha512_hash *) clEnqueueMapBuffer(queue[ocl_gpu_id],
            pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0,
            sizeof(sha512_hash) * gws, 0, NULL, NULL, &ret_code);
    HANDLE_CLERROR(ret_code, "Error mapping page-locked memory out_hashes");

    // create arguments (buffers)
    salt_buffer = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_ONLY,
            sizeof(sha512_salt), NULL, &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating salt_buffer out argument");

    pass_buffer = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_ONLY,
            sizeof(sha512_password) * gws, NULL, &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_keys");

    hash_buffer = clCreateBuffer(context[ocl_gpu_id], CL_MEM_WRITE_ONLY,
            sizeof(sha512_hash) * gws, NULL, &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_out");

    work_buffer = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_WRITE,
            sizeof(sha512_buffers) * gws, NULL, &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating buffer argument work_area");

    //Set kernel arguments
    HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 0, sizeof(cl_mem),
            (void *) &salt_buffer), "Error setting argument 0");
    HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof(cl_mem),
            (void *) &pass_buffer), "Error setting argument 1");
    HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof(cl_mem),
            (void *) &hash_buffer), "Error setting argument 2");

    if (gpu(source_in_use) || use_local(source_in_use)) {
        //Set prepare kernel arguments
        HANDLE_CLERROR(clSetKernelArg(prepare_kernel, 0, sizeof(cl_mem),
            (void *) &salt_buffer), "Error setting argument 0");
        HANDLE_CLERROR(clSetKernelArg(prepare_kernel, 1, sizeof(cl_mem),
            (void *) &pass_buffer), "Error setting argument 1");
        HANDLE_CLERROR(clSetKernelArg(prepare_kernel, 2, sizeof(cl_mem),
            (void *) &work_buffer), "Error setting argument 2");

        //Fast working memory.
        HANDLE_CLERROR(clSetKernelArg(prepare_kernel, 3,
            sizeof(sha512_password) * local_work_size,
            NULL), "Error setting argument 3");

        if (use_local(source_in_use)) {
            HANDLE_CLERROR(clSetKernelArg(prepare_kernel, 4,
                sizeof(sha512_buffers) * local_work_size,
                NULL), "Error setting argument 4");
            HANDLE_CLERROR(clSetKernelArg(prepare_kernel, 5,
                sizeof(sha512_ctx) * local_work_size,
                NULL), "Error setting argument 5");
        }
        //Set crypt kernel arguments
        HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 3, sizeof(cl_mem),
            (void *) &work_buffer), "Error setting argument crypt_kernel (3)");

        if (use_local(source_in_use)) {
            //Fast working memory.
            HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 4,
                sizeof(sha512_buffers) * local_work_size,
                NULL), "Error setting argument 4");
            HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 5,
                sizeof(sha512_ctx) * local_work_size,
                NULL), "Error setting argument 5");
        }
        //Set final kernel arguments
        HANDLE_CLERROR(clSetKernelArg(final_kernel, 0, sizeof(cl_mem),
                (void *) &salt_buffer), "Error setting argument 0");
        HANDLE_CLERROR(clSetKernelArg(final_kernel, 1, sizeof(cl_mem),
                (void *) &pass_buffer), "Error setting argument 1");
        HANDLE_CLERROR(clSetKernelArg(final_kernel, 2, sizeof(cl_mem),
                (void *) &hash_buffer), "Error setting argument 2");
        HANDLE_CLERROR(clSetKernelArg(final_kernel, 3, sizeof(cl_mem),
            (void *) &work_buffer), "Error setting argument crypt_kernel (3)");

        if (use_local(source_in_use)) {
            //Fast working memory.
            HANDLE_CLERROR(clSetKernelArg(final_kernel, 4,
                sizeof(sha512_buffers) * local_work_size,
                NULL), "Error setting argument 4");
            HANDLE_CLERROR(clSetKernelArg(final_kernel, 5,
                sizeof(sha512_ctx) * local_work_size,
                NULL), "Error setting argument 5");
        }
    }
    memset(plaintext, '\0', sizeof(sha512_password) * gws);
    global_work_size = gws;
}
Example no. 9
void Render(float delta)
{
    clEnqueueNDRangeKernel(   queue,
                           kernel,
                           1,
                           NULL,
                           &global_work_size,
                           NULL, 0, NULL, NULL);
    
    // 7. Look at the results via synchronous buffer map.
    cl_float4 *ptr = (cl_float4 *) clEnqueueMapBuffer( queue,
                                                      buffer,
                                                      CL_TRUE,
                                                      CL_MAP_READ,
                                                      0,
                                                      kWidth * kHeight * sizeof(cl_float4),
                                                      0, NULL, NULL, NULL );
    
    cl_float *viewTransformPtr = (cl_float *) clEnqueueMapBuffer( queue,
                                                                 viewTransform,
                                                                 CL_TRUE,
                                                                 CL_MAP_WRITE,
                                                                 0,
                                                                 16 * sizeof(cl_float),
                                                                 0, NULL, NULL, NULL );
    
    cl_float *worldTransformsPtr = (cl_float *) clEnqueueMapBuffer( queue,
                                                                   worldTransforms,
                                                                   CL_TRUE,
                                                                   CL_MAP_WRITE,
                                                                   0,
                                                                   16 * sizeof(cl_float)*2,
                                                                   0, NULL, NULL, NULL );
    
    
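    // copy the 4x4 view matrix and the two 4x4 sphere world transforms into the mapped staging buffers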
    memcpy(viewTransformPtr, viewMatrix, sizeof(float)*16);
    memcpy(worldTransformsPtr, sphereTransforms[0], sizeof(float)*16);
    memcpy(worldTransformsPtr+16, sphereTransforms[1], sizeof(float)*16);
    
    
    clEnqueueUnmapMemObject(queue, viewTransform, viewTransformPtr, 0, 0, 0);
    clEnqueueUnmapMemObject(queue, worldTransforms, worldTransformsPtr, 0, 0, 0);
    
    unsigned char* pixels = new unsigned char[kWidth*kHeight*4];
    for(int i=0; i <  kWidth * kHeight; i++){
        pixels[i*4] = ptr[i].s[0]*255;
        pixels[i*4+1] = ptr[i].s[1]*255;
        pixels[i*4+2] = ptr[i].s[2]*255;
        pixels[i*4+3] = 1;
    }
    
    // release the mapped result pointer now that the pixels have been copied out
    clEnqueueUnmapMemObject(queue, buffer, ptr, 0, NULL, NULL);
    
    glBindTexture(GL_TEXTURE_2D, 1);
    glTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR );
    glTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR );
    glTexImage2D(GL_TEXTURE_2D, 0, 4, kWidth, kHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, pixels);
    delete [] pixels;
    
    glClearColor(1,1,1,1);
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    glOrtho(-1,1,1,-1,1,100);
    glMatrixMode(GL_MODELVIEW);
	   
    glLoadIdentity();
    glBegin(GL_QUADS);
    glTexCoord2f(0,1);
    glVertex3f(-1,-1,-1);
    glTexCoord2f(0,0);
    glVertex3f(-1,1,-1);
    glTexCoord2f(1,0);
    glVertex3f(1,1,-1);
    glTexCoord2f(1,1);
    glVertex3f(1,-1,-1);
    glEnd();
    
    clFinish( queue );
    SDL_GL_SwapWindow(window);
}
Example no. 10
int main(int argc, char **argv){
	
	printf("Check OpenCL environtment\n");

	cl_platform_id platid;
	cl_device_id devid;
	cl_int res;
	size_t param;
	
	/* Query OpenCL, get some information about the returned device */
	clGetPlatformIDs(1u, &platid, NULL);
	clGetDeviceIDs(platid, CL_DEVICE_TYPE_ALL, 1, &devid, NULL);

	cl_char vendor_name[1024] = {0};
	cl_char device_name[1024] = {0};
	clGetDeviceInfo(devid, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, NULL);
	clGetDeviceInfo(devid, CL_DEVICE_NAME,   sizeof(device_name), device_name, NULL);
	printf("Connecting to OpenCL device:\t%s %s\n", vendor_name, device_name);
	
	cl_uint max_cu;
	clGetDeviceInfo(devid, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_cu, NULL);
	printf("CL_DEVICE_MAX_COMPUTE_UNITS\t%u\n", max_cu);
	
	clGetDeviceInfo(devid, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &param, NULL);
	printf("CL_DEVICE_MAX_WORK_GROUP_SIZE\t%zu\n", param);

	cl_ulong local_mem;
	clGetDeviceInfo(devid, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem, NULL);
	printf("CL_DEVICE_LOCAL_MEM_SIZE\t%lub\n", (unsigned long)local_mem);

	/* Check if kernel source exists, we compile argv[1] passed kernel */
	if(argv[1] == NULL) { printf("\nUsage: %s kernel_source.cl kernel_function\n", argv[0]); exit(1); }

	char *kernel_source;
	if(load_program_source(argv[1], &kernel_source)) return 1;
	
	printf("Building from OpenCL source: \t%s\n", argv[1]);
	printf("Compile/query OpenCL_program:\t%s\n", argv[2]);
	
	/* Create context and kernel program */
	cl_context context = 	clCreateContext(0, 1, &devid, NULL, NULL, NULL);
	cl_program pro = 	clCreateProgramWithSource(context, 1, (const char **)&kernel_source, NULL, NULL);
	res = 			clBuildProgram(pro, 1, &devid, "-cl-fast-relaxed-math", NULL, NULL);

	if(res != CL_SUCCESS){
		printf("clBuildProgram failed: %d\n", res); char buf[0x10000];
		clGetProgramBuildInfo(pro, devid, CL_PROGRAM_BUILD_LOG, 0x10000, buf, NULL);
		printf("\n%s\n", buf); return(-1); }

	cl_kernel kernelobj = clCreateKernel(pro, argv[2], &res); 	check_return(res);
	
	/* Get the maximum work-group size for executing the kernel on the device */
	size_t global, local;
	res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);		check_return(res);
	printf("CL_KERNEL_WORK_GROUP_SIZE\t%zu\n", local);
	
	res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem, NULL);	check_return(res);
	printf("CL_KERNEL_LOCAL_MEM_SIZE\t%lub\n", (unsigned long)local_mem);
	
	cl_command_queue cmd_queue = clCreateCommandQueue(context, devid, CL_QUEUE_PROFILING_ENABLE, NULL);
	if(cmd_queue == NULL) { printf("Compute device setup failed\n"); return(-1); }

	local = 4;
	int n = 2 * local;	//num_group * local workgroup size 
	global = n;
	
	int	num_groups=		global / local,
		allocated_local=	sizeof(data) * local + 
					sizeof(debug) * local;

	data *DP __attribute__ ((aligned(16)));
	DP = calloc(n, sizeof(data) *1);

	debug *dbg __attribute__ ((aligned(16)));
	dbg = calloc(n, sizeof(debug));
	
	printf("global:%d, local:%d, (should be):%d groups\n", global, local, num_groups);
	printf("structs size: %db, %db, %db\n", sizeof(data), sizeof(Elliptic_Curve), sizeof(inv256));
	printf("sets:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(cl_uint4) *5 *4, allocated_local);

	cl_mem	cl_DP, cl_EC, cl_INV, DEBUG;
	cl_DP = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, n * sizeof(data), NULL, &res);					check_return(res);				
	cl_EC = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY,  1 * sizeof(Elliptic_Curve), NULL, &res);	check_return(res);	//_constant address space
	cl_INV= clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY,  1 * sizeof(u8) * 0x80, NULL, &res);		check_return(res);
	DEBUG = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_WRITE_ONLY, n * sizeof(debug), NULL, &res);		check_return(res);
	
	Elliptic_Curve EC;
	/*	
		Curve domain parameters, (test vectors)
		-------------------------------------------------------------------------------------
		p:	c1c627e1638fdc8e24299bb041e4e23af4bb5427		is prime
		a:	c1c627e1638fdc8e24299bb041e4e23af4bb5424		divisor g = 62980
		b:	877a6d84155a1de374b72d9f9d93b36bb563b2ab		divisor g = 227169643
		Gx: 	010aff82b3ac72569ae645af3b527be133442131		divisor g = 32209245
		Gy: 	46b8ec1e6d71e5ecb549614887d57a287df573cc		divisor g = 972	
		precomputed_per_curve_constants:
		U:	c1c627e1638fdc8e24299bb041e4e23af4bb5425
		V:	3e39d81e9c702371dbd6644fbe1b1dc50b44abd9
		
		already prepared mod p to test:
		a:      07189f858e3f723890a66ec1079388ebd2ed509c
		b:      6043379beb0dade6eed1e9d6de64f4a0c50639d4
		gx:     5ef84aacf4f0ea6752f572d0741f40049f354dca
		gy:     418c695435af6b3d4d7cbb72967395016ef67239
		resulting point:
		P.x:    01718f862ebe9423bd661a65355aa1c86ba330f8		program MUST got this point !!
		P.y:    557e8ed53ffbfe2c990a121967b340f62e0e4fe2
		taken mod p:
		P.x:    41da1a8f74ff8d3f1ce20ef3e9d8865c96014fe3		
		P.y:    73ca143c9badedf2d9d3c7573307115ccfe04f13
	*/	
	u8 *t;
	t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5427");	memcpy(EC.p, t, 20);
	t = _x_to_u8_buffer("07189f858e3f723890a66ec1079388ebd2ed509c");	memcpy(EC.a, t, 20);
	t = _x_to_u8_buffer("6043379beb0dade6eed1e9d6de64f4a0c50639d4");	memcpy(EC.b, t, 20);
	t = _x_to_u8_buffer("5ef84aacf4f0ea6752f572d0741f40049f354dca");	memcpy(EC.Gx, t, 20);
	t = _x_to_u8_buffer("418c695435af6b3d4d7cbb72967395016ef67239");	memcpy(EC.Gy, t, 20);
	
	t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5425");	memcpy(EC.U, t, 20);
	t = _x_to_u8_buffer("3e39d81e9c702371dbd6644fbe1b1dc50b44abd9");	memcpy(EC.V, t, 20);

	/* we need to map buffer now to load some k into data */
	DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_WRITE, 0, n * sizeof(data),  0, NULL, NULL, &res);	check_return(res);

	t = _x_to_u8_buffer("00542d46e7b3daac8aeb81e533873aabd6d74bb710");
	for(u8 i = 0; i < n; i++) memcpy(DP[i].k, t, 21);
	
	free(t);

	/* unmap before the kernel runs; accessing a mapped region from a kernel is undefined behaviour */
	res = clEnqueueUnmapMemObject(cmd_queue, cl_DP, DP, 0, NULL, NULL);	check_return(res);
//d	for(u8 i = 0; i < n; i++) bn_print("", DP[i].k, 21, 1);

	/* we can alter just a byte into a chosen k to verify that we'll get a different point! */
	//DP[2].k[2] = 0x09;
	
//no	res = clEnqueueWriteBuffer(cmd_queue, cl_DP,  CL_TRUE, 0, n * sizeof(data), &DP, 0, NULL, NULL);	check_return(res);

	res = clEnqueueWriteBuffer(cmd_queue, cl_EC,  CL_TRUE, 0, 1 * sizeof(Elliptic_Curve), &EC, 0, NULL, NULL);	check_return(res);
	res = clEnqueueWriteBuffer(cmd_queue, cl_INV, CL_TRUE, 0, 1 * sizeof(u8) * 0x80, &inv256, 0, NULL, NULL);	check_return(res);

	res = clSetKernelArg(kernelobj, 0, sizeof(cl_mem), &cl_DP);		/* i/o buffer */
	res|= clSetKernelArg(kernelobj, 1, sizeof(data) * local *1, NULL);	//allocate space for __local in kernel (just this!) one * localsize
	res|= clSetKernelArg(kernelobj, 2, sizeof(cl_mem), &cl_EC);
	res|= clSetKernelArg(kernelobj, 3, sizeof(cl_mem), &cl_INV);	
	res|= clSetKernelArg(kernelobj, 4, sizeof(debug) * local *1, NULL);	//allocate space for __local in kernel (just this!) one * localsize
	res|= clSetKernelArg(kernelobj, 5, sizeof(cl_mem), &DEBUG);		//this used to debug kernel output
	check_return(res);

//	printf("n:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(debug), allocated_local);	
	
	cl_event NDRangeEvent;
	cl_ulong start, end;
	
	/* Execute NDrange */	
	res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, &local, 0, NULL, &NDRangeEvent);		check_return(res);
//	res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, NULL, 0, NULL, &NDRangeEvent);		check_return(res);
	
	printf("Read back, Mapping buffer:\t%db\n", n * sizeof(data));

	DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_READ, 0, n * sizeof(data),  0, NULL, NULL, &res);	check_return(res);
	dbg =clEnqueueMapBuffer(cmd_queue, DEBUG, CL_TRUE, CL_MAP_READ, 0, n * sizeof(debug), 0, NULL, NULL, &res);	check_return(res);
	
	/* using clEnqueueReadBuffer template */
//	res = clEnqueueReadBuffer(cmd_queue, ST, CL_TRUE, 0, sets * sizeof(cl_uint8), dbg, 0, NULL, NULL);			check_return(res);
		
	clFlush(cmd_queue);
	clFinish(cmd_queue);

	/* get NDRange execution time with internal ocl profiler */
	res = clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
	res|= clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &end,   NULL);
	check_return(res);
	printf("kernel execution time:\t\t%.2f ms\n", (float) ((end - start) /1000000));			//relative to NDRange call
	printf("number of computes/sec:\t%.2f\n", (float) global *1000000 /((end - start)));

	
	printf("i,\tgid\tlid0\tlsize0\tgid0/lsz0,\tgsz0,\tn_gr0,\tlid5,\toffset\n");
	for(int i = 0; i < n; i++) {		
//		if(i %local == 0) {
			printf("%d \t", i);
			//printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", *p, *(p +1), *(p +2), *(p +3), *(p +4), *(p +5), *(p +6), *(p +7));
			
			/* silence this doubled debug info
			printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", 
				dbg[i].data[0], dbg[i].data[1], dbg[i].data[2], dbg[i].data[3],
				dbg[i].data[4], dbg[i].data[5], dbg[i].data[6], dbg[i].data[7]);
			*/	
			//printf("%d %d\n", P[i].dig, P[i].c);
			bn_print("", DP[i].k, 21, 1);
			bn_print("", DP[i].rx, 20, 0); bn_print(" ", DP[i].ry, 20, 1);
			
			printf("%u(/%u) = %u*%u(/%u) +%u, offset:%u, stride:%u\n", 
				DP[i].pad[0], DP[i].pad[1], DP[i].pad[2], DP[i].pad[3],
				DP[i].pad[4], DP[i].pad[5], DP[i].pad[6], DP[i].pad[7]);
//		}
	}
	
	/* Release OpenCL stuff, free the rest */
	clReleaseMemObject(cl_DP);
	clReleaseMemObject(cl_EC);
	clReleaseMemObject(cl_INV);
	clReleaseMemObject(DEBUG);
	clReleaseKernel(kernelobj);
	clReleaseProgram(pro);
	clReleaseCommandQueue(cmd_queue);
	clReleaseContext(context);
	
	free(kernel_source);
	
	puts("Done!");
	return 0;
}
Example no. 11
int main(int argc, const char** argv) {
	cl_uint platform_count;
	cl_platform_id platforms[5];

	cl_int err = CL_SUCCESS;
	unsigned int i, p;

	cl_device_type dev_type = CL_DEVICE_TYPE_ALL;

	void * ptrs[BLOCKS];
	cl_command_queue cqs[BLOCKS];
	cl_mem d_A[BLOCKS];
	cl_mem d_C[BLOCKS];
	cl_mem d_B[BLOCKS];

	cl_event GPUDone[BLOCKS];
	cl_event GPUExecution[BLOCKS];
	struct timeval start, end;

	int workOffset[BLOCKS];
	int workSize[BLOCKS];

	unsigned int sizePerGPU = HC / BLOCKS;
	unsigned int sizeMod = HC % BLOCKS;

	size_t A_size = WA * HA;
	size_t A_mem_size = sizeof(TYPE) * A_size;
	TYPE* A_data;

	size_t B_size = WB * HB;
	size_t B_mem_size = sizeof(TYPE) * B_size;
	TYPE* B_data;

	size_t C_size = WC * HC;
	size_t C_mem_size = sizeof(TYPE) * C_size;
	TYPE* C_data;

	parse_args(argc, argv);

	check(clGetPlatformIDs(5, platforms, &platform_count));
	if (platform_count == 0) {
		printf("No platform found\n");
		exit(77);
	}

	cl_uint device_count;
	cl_uint devs[platform_count];
	cl_device_id * devices[platform_count];
	cl_context ctx[platform_count];
	cl_command_queue * commandQueue[platform_count];

	device_count = 0;
	for (p=0; p<platform_count; p++) {
		cl_platform_id platform = platforms[p];

		err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]);
		if (err == CL_DEVICE_NOT_FOUND) {
			devs[p] = 0;
			continue;
		}
		if (devs[p] == 0) {
		     printf("No OpenCL device found\n");
		     exit(77);
		}
		if (err != CL_SUCCESS) {
			fprintf(stderr, "OpenCL Error (%d) in clGetDeviceIDs()\n", err);
			exit(EXIT_FAILURE);
		}
		if (devs[p] == 0)
			continue;

		devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]);
		commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]);

		check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL));

		cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
		check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err));

		for(i = 0; i < devs[p]; ++i)
		{
			cl_device_id device = devices[p][i];
			char name[2048];
			name[0] = '\0';
			clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL);
			printf("Device %d: %s\n", i, name);

			check2(commandQueue[p][i] = clCreateCommandQueue(ctx[p], device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err));
		}

		device_count += devs[p];
	}

	if (device_count == 0)
		error("No device found\n");



	cl_kernel multiplicationKernel[platform_count];

	printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n",
			(unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC);

	// allocate host memory for matrices A, B and C
	A_data = (TYPE*)malloc(A_mem_size);
	if (A_data == NULL) {
		perror("malloc");
	}

	B_data = (TYPE*)malloc(B_mem_size);
	if (B_data == NULL) {
		perror("malloc");
	}

	C_data = (TYPE*) malloc(C_mem_size);
	if (C_data == NULL) {
		perror("malloc");
	}

	cl_program program[platform_count];

	for (p=0; p<platform_count; p++) {
		if (devs[p] == 0)
			continue;

		check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err));

		check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL));

		check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err));
	}

	printf("Initializing data...\n");
	srand(2008);
	fillArray(A_data, A_size);
	fillArray(B_data, B_size);
	memset(C_data, 0, C_mem_size);	/* zero all of C: C_size elements of sizeof(TYPE) bytes each */


	printf("Computing...\n");
	workOffset[0] = 0;
	gettimeofday(&start, NULL);

	size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
	int c = 0;
	for (p=0; p<platform_count;p++) {
		for (i=0; i<devs[p]; i++) {
			check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err));
			c++;
		}
	}

	for(i=0; i < BLOCKS; ++i)
	{
		int d = i % device_count;
		cl_uint platform = 0;

		// determine device platform
		int dev = d;
		for (platform = 0; platform < platform_count; platform++) {
			if ((cl_int)(dev - devs[platform]) < 0)
				break;
			dev -= devs[platform];
		}

		workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU;
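		/* distribute the HC rows of C over BLOCKS chunks; the first sizeMod chunks get one extra row */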

		check2(d_A[i] = clCreateBuffer(ctx[platform], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));
		check2(d_C[i] = clCreateBuffer(ctx[platform], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));

		check(clSetKernelArg(multiplicationKernel[platform], 0, sizeof(cl_int), &workSize[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 1, sizeof(cl_int), &workSize[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 2, sizeof(cl_int), &workSize[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 3, sizeof(cl_mem), (void *) &d_A[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 4, sizeof(cl_mem), (void *) &d_B[d]));
		check(clSetKernelArg(multiplicationKernel[platform], 5, sizeof(cl_mem), (void *) &d_C[i]));

		size_t globalWorkSize[] = {roundUp(BLOCK_SIZE,WC), roundUp(BLOCK_SIZE,workSize[i])};
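		/* the global work size is rounded up to a multiple of BLOCK_SIZE because it must be divisible by the local work size */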

		check(clEnqueueNDRangeKernel(commandQueue[platform][dev], multiplicationKernel[platform], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));

		// Non-blocking map of the result block back to host memory: waits on GPUExecution[i], signals GPUDone[i]
		cqs[i] = commandQueue[platform][dev];
		check2(ptrs[i] = clEnqueueMapBuffer(cqs[i], d_C[i], CL_FALSE, CL_MAP_READ, 0, WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err));

		if(i+1 < BLOCKS)
			workOffset[i + 1] = workOffset[i] + workSize[i];
	}


	// CPU sync with GPU
	for (p=0; p<platform_count;p++) {
		cl_uint dev;
		for (dev=0; dev<devs[p]; dev++) {
			clFinish(commandQueue[p][dev]);
		}
	}

	gettimeofday(&end, NULL);
	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));

	double dSeconds = timing/1000/1000;
	double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB;
	double gflops = 1.0e-9 * dNumOps/dSeconds;

	printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n",
			gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]);

	// compute reference solution
	if (check) {
		printf("Comparing results with CPU computation... ");
		TYPE* reference = (TYPE*)malloc(C_mem_size);
		computeReference(reference, A_data, B_data, HA, WA, WB);

		// check result
		int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f);
		if (res == 0) {
			printf("\n\n");
			printDiff(reference, C_data, WC, HC, 100, 1.0e-5f);
		}
		else printf("PASSED\n\n");
		free(reference);
	}

	for(i = 0; i < BLOCKS; i++)
	{
		clEnqueueUnmapMemObject(cqs[i], d_C[i], ptrs[i], 0, NULL, NULL);
	}

	for(i = 0; i < BLOCKS; i++)
	{
		clFinish(cqs[i]);
	}

	for (i=0; i<device_count; i++) {
		clReleaseMemObject(d_B[i]);
	}

	for(i = 0; i < BLOCKS; i++)
	{
		clReleaseMemObject(d_A[i]);
		clReleaseMemObject(d_C[i]);
		clReleaseEvent(GPUExecution[i]);
		clReleaseEvent(GPUDone[i]);
	}


	for (p=0; p<platform_count;p++) {
		if (devs[p] == 0)
			continue;

		check(clReleaseKernel(multiplicationKernel[p]));
		check(clReleaseProgram(program[p]));
		check(clReleaseContext(ctx[p]));
		cl_uint k;
		for(k = 0; k < devs[p]; ++k)
		{
			check(clReleaseCommandQueue(commandQueue[p][k]));
		}
	}

	free(A_data);
	free(B_data);
	free(C_data);

	return 0;
}
Esempio n. 12
0
///////////////////////////////////////////////////////////////////////////////
//  test the bandwidth of a device to host memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
double testHostToDeviceTransfer(unsigned int memSize, accessMode accMode, memoryMode memMode)
{
    double elapsedTimeInSec = 0.0;
    double bandwidthInMBs = 0.0;
    unsigned char* h_data = NULL;
    cl_mem cmPinnedData = NULL;
    cl_mem cmDevData = NULL;
    cl_int ciErrNum = CL_SUCCESS;

    // Allocate and init host memory, pinned or conventional
    if(memMode == PINNED)
   { 
        // Create a host buffer
        cmPinnedData = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, memSize, NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

        // Get a mapped pointer
        h_data = (unsigned char*)clEnqueueMapBuffer(cqCommandQueue, cmPinnedData, CL_TRUE, CL_MAP_WRITE, 0, memSize, 0, NULL, NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //initialize 
        for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
        {
            h_data[i] = (unsigned char)(i & 0xff);
        }
	
        // unmap and make data in the host buffer valid
        ciErrNum = clEnqueueUnmapMemObject(cqCommandQueue, cmPinnedData, (void*)h_data, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);
		h_data = NULL;  // buffer is unmapped
    }
    else 
    {
        // standard host alloc
        h_data = (unsigned char *)malloc(memSize);

        //initialize 
        for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
        {
            h_data[i] = (unsigned char)(i & 0xff);
        }
    }

    // allocate device memory 
    cmDevData = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Sync queue to host, start timer 0, and copy data from Host to GPU
    clFinish(cqCommandQueue);
    shrDeltaT(0);
    if(accMode == DIRECT)
    { 
	    if(memMode == PINNED) 
        {
            // Get a mapped pointer
            h_data = (unsigned char*)clEnqueueMapBuffer(cqCommandQueue, cmPinnedData, CL_TRUE, CL_MAP_READ, 0, memSize, 0, NULL, NULL, &ciErrNum);
            oclCheckError(ciErrNum, CL_SUCCESS);
	    }

        // DIRECT:  API access to device buffer 
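        // enqueue MEMCOPY_ITERATIONS non-blocking writes; the clFinish below waits for
        // all of them so the timer measures the whole batch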
        for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
        {
                ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevData, CL_FALSE, 0, memSize, h_data, 0, NULL, NULL);
                oclCheckError(ciErrNum, CL_SUCCESS);
        }
        ciErrNum = clFinish(cqCommandQueue);
        oclCheckError(ciErrNum, CL_SUCCESS);
    } 
    else 
    {
        // MAPPED: mapped pointers to device buffer and conventional pointer access
        void* dm_idata = clEnqueueMapBuffer(cqCommandQueue, cmDevData, CL_TRUE, CL_MAP_WRITE, 0, memSize, 0, NULL, NULL, &ciErrNum);
		oclCheckError(ciErrNum, CL_SUCCESS);
		if(memMode == PINNED ) 
		{
			h_data = (unsigned char*)clEnqueueMapBuffer(cqCommandQueue, cmPinnedData, CL_TRUE, CL_MAP_READ, 0, memSize, 0, NULL, NULL, &ciErrNum); 
            oclCheckError(ciErrNum, CL_SUCCESS); 
        } 
        for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
        {
            memcpy(dm_idata, h_data, memSize);
        }
        ciErrNum = clEnqueueUnmapMemObject(cqCommandQueue, cmDevData, dm_idata, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);
    }
    
    //get the the elapsed time in seconds
    elapsedTimeInSec = shrDeltaT(0);
    
    //calculate bandwidth in MB/s
    bandwidthInMBs = ((double)memSize * (double)MEMCOPY_ITERATIONS)/(elapsedTimeInSec * (double)(1 << 20));

    //clean up memory
    if(cmDevData)clReleaseMemObject(cmDevData);
    if(cmPinnedData) 
    {
	    clEnqueueUnmapMemObject(cqCommandQueue, cmPinnedData, (void*)h_data, 0, NULL, NULL);
	    clReleaseMemObject(cmPinnedData);
    }
    else if(h_data)
    {
        free(h_data);   // conventional (non-pinned) host allocation
    }
    h_data = NULL;

    return bandwidthInMBs;
}
Esempio n. 13
0
magma_err_t
magma_cgeqrf2_gpu( magma_int_t m, magma_int_t n,
                   magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
                   magmaFloatComplex *tau, magma_err_t *info,
                   magma_queue_t* queue)
{
/*  -- clMAGMA (version 1.1.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2014

    Purpose
    =======
    CGEQRF computes a QR factorization of a complex M-by-N matrix A:
    A = Q * R.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    dA      (input/output) COMPLEX array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix dA.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalesced memory accesses, LDDA must be
            divisible by 16.

    TAU     (output) COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -9, internal GPU memory allocation failed.

    Further Details
    ===============

    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    #define dA(a_1,a_2)    dA, (dA_offset + (a_1) + (a_2)*(ldda))
    #define work_ref(a_1)  work, (a_1)
    #define work_href(a_1) ( work + (a_1))
    #define hwork          ( work + (nb)*(m))
    #define hhwork         work, ((nb)*(m))  

    magmaFloatComplex_ptr dwork;
    magmaFloatComplex  *work;

    magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows;
    magma_int_t nbmin, nx, ib, nb;
    magma_int_t lhwork, lwork;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = min(m,n);
    if (k == 0)
        return MAGMA_SUCCESS;

    nb = magma_get_cgeqrf_nb(m);

    lwork  = (m+n) * nb;
    lhwork = lwork - (m)*nb;

    
    if ( MAGMA_SUCCESS != magma_cmalloc( &dwork, n*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /*    
    if ( MAGMA_SUCCESS != magma_cmalloc_cpu( &work, lwork ) ) {
        *info = MAGMA_ERR_HOST_ALLOC;
        magma_free( dwork );
        return *info;
    }
    */

    cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 
                                   sizeof(magmaFloatComplex)*lwork, NULL, NULL);
    work = (magmaFloatComplex*)clEnqueueMapBuffer(queue[0], buffer, CL_TRUE, 
                                                   CL_MAP_READ | CL_MAP_WRITE, 
                                                   0, lwork*sizeof(magmaFloatComplex), 
                                                   0, NULL, NULL, NULL);
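    /* the mapped CL_MEM_ALLOC_HOST_PTR buffer serves as the (pinned) host workspace,
       taking the place of the magma_cmalloc_cpu() call commented out above */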


    nbmin = 2;
    nx    = nb;
    ldwork = m;
    lddwork= n;

    if (nb >= nbmin && nb < k && nx < k) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nx; i += nb) {
            ib = min(k-i, nb);
            rows = m -i;
            
            magma_queue_sync( queue[1] );
            chk(magma_cgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), ldwork, queue[0], NULL));
          
            if (i>0){
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i         ), ldda, dwork,0,      lddwork,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queue[1]);

                chk(magma_csetmatrix_async( old_ib, old_ib, work_ref(old_i), ldwork,
                                            dA(old_i, old_i), ldda, queue[1], NULL));
            }

            magma_queue_sync(queue[0]);
            lapackf77_cgeqrf(&rows, &ib, work_href(i), &ldwork, tau+i, hwork, &lhwork, info);
   
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              work_href(i), &ldwork, tau+i, hwork, &ib);

            cpanel_to_q( MagmaUpper, ib, work_href(i), ldwork, hwork+ib*ib );

            /* download the i-th V matrix */
            chk(magma_csetmatrix_async(rows, ib, work_ref(i), ldwork, dA(i,i), ldda, queue[0], NULL));

            /* download the T matrix */
            magma_queue_sync( queue[1] );
            chk(magma_csetmatrix_async( ib, ib, hhwork, ib, dwork, 0, lddwork, queue[0], NULL));
            magma_queue_sync( queue[0] );

            if (i + ib < n)
              {
                
                if (i+nb < k-nx) {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dwork,0,  lddwork,
                                      dA(i, i+ib), ldda, dwork,ib, lddwork, queue[1]);
                    cq_to_panel( MagmaUpper, ib, work_href(i), ldwork, hwork+ib*ib );
                }
                else {
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dwork,0,  lddwork,
                                      dA(i, i+ib), ldda, dwork,ib, lddwork, queue[1]);
                    cq_to_panel( MagmaUpper, ib, work_href(i), ldwork, hwork+ib*ib );
                    chk(magma_csetmatrix_async(ib, ib, work_ref(i), ldwork, dA(i,i), ldda, queue[1], NULL));
                }
                old_i  = i;
                old_ib = ib;
              }
        }
    } else {
        i = 0;
    }

    magma_free(dwork);

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib   = n-i;
        rows = m-i;
        magma_cgetmatrix_async(rows, ib, dA(i, i), ldda, work, 0, rows, queue[1], NULL);
        magma_queue_sync(queue[1]);
        
        lhwork = lwork - rows*ib;
        lapackf77_cgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
        
        magma_csetmatrix_async(rows, ib, work, 0, rows, dA(i, i), ldda, queue[1], NULL);
    }

    magma_queue_sync(queue[0]);
    magma_queue_sync(queue[1]);

    // magma_free_cpu(work);
    clEnqueueUnmapMemObject(queue[0], buffer, work, 0, NULL, NULL);
    clReleaseMemObject(buffer);

    return *info;
} /* magma_cgeqrf2_gpu */
Esempio n. 14
0
int main(void) {
  cl_context context = 0;
  cl_command_queue command_waiting_line = 0;
  cl_program program = 0;
  cl_device_id device_id = 0;
  cl_kernel kernel = 0;
  // int numberOfMemoryObjects = 3;
  cl_mem memoryObjects[3] = {0, 0, 0};
  cl_platform_id platform_id = NULL;
  cl_uint ret_num_devices;
  cl_int errorNumber;
  cl_int ret;
  /* Load the source code containing the kernel*/
  char fileName[] = "source/parallel/composition_population.cl";
  FILE *fp;
  char *source_str;
  size_t source_size;
  fp = fopen(fileName, "r");
  cl_uint ret_num_platforms;
  if (!fp) {
    fprintf(stderr, "Failed to load kernel %s:%d.\n", __FILE__, __LINE__);
    exit(1);
  }
  source_str = (char *)malloc(MAX_SOURCE_SIZE);
  source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
  fclose(fp);

  // printf("file: %s :file", source_str);

  getInfo();

  ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to get platform id's. %s:%d\n", __FILE__, __LINE__);
    return 1;
  }
  ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
                       &ret_num_devices);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to get OpenCL devices. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to create an OpenCL context. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

#ifdef CL_VERSION_2_0
  command_waiting_line =
      clCreateCommandQueueWithProperties(context, device_id, 0, &ret);
#else
  command_waiting_line = clCreateCommandQueue(context, device_id, 0, &ret);
#endif

  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to create the OpenCL command queue. %s:%d\n",
            __FILE__, __LINE__);
    return 1;
  }

  /* create program */

  program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
                                      (const size_t *)&source_size, &ret);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to create OpenCL program. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  /* Build Kernel Program */
  ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to build OpenCL program. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  kernel = clCreateKernel(program, "composition_population", &errorNumber);
  if (!success_verification(errorNumber)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to create OpenCL kernel. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  /* [Setup memory] */
  /* Number of elements in the arrays of input and output data. */

  /* The buffers are the size of the arrays. */
  uint16_t activity_atom_size = MAX_INDEPENDENTCLAUSE_TABLET * 1;
  uint8_t program_size = 1;
  uint8_t population_size = 4;
  size_t activity_atom_byte_size = activity_atom_size * sizeof(v16us);
  uint16_t population_byte_size =
      (uint16_t)(program_size * (uint16_t)(population_size * sizeof(v16us)));

  /*
   * Ask the OpenCL implementation to allocate buffers for the data.
   * We ask the OpenCL implemenation to allocate memory rather than allocating
   * it on the CPU to avoid having to copy the data later.
   * The read/write flags relate to accesses to the memory from within the
   * kernel.
   */
  int createMemoryObjectsSuccess = TRUE;

  memoryObjects[0] =
      clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                     activity_atom_byte_size, NULL, &errorNumber);
  createMemoryObjectsSuccess &= success_verification(errorNumber);

  memoryObjects[1] =
      clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                     population_byte_size, NULL, &errorNumber);
  createMemoryObjectsSuccess &= success_verification(errorNumber);

  memoryObjects[2] =
      clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                     newspaper_byte_size, NULL, &errorNumber);
  createMemoryObjectsSuccess &= success_verification(errorNumber);

  if (!createMemoryObjectsSuccess) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to create OpenCL buffer. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  /* [Setup memory] */

  /* [Map the buffers to pointers] */
  /* Map the memory buffers created by the OpenCL implementation to pointers so
   * we can access them on the CPU. */
  int mapMemoryObjectsSuccess = TRUE;

  v16us *activity_atom = (v16us *)clEnqueueMapBuffer(
      command_waiting_line, memoryObjects[0], CL_TRUE, CL_MAP_WRITE, 0,
      activity_atom_byte_size, 0, NULL, NULL, &errorNumber);
  mapMemoryObjectsSuccess &= success_verification(errorNumber);

  // cl_int *inputB = (cl_int *)clEnqueueMapBuffer(
  //    command_waiting_line, memoryObjects[1], CL_TRUE, CL_MAP_WRITE, 0,
  //    bufferSize, 0,
  //    NULL, NULL, &errorNumber);
  // mapMemoryObjectsSuccess &= success_verification(errorNumber);

  if (!mapMemoryObjectsSuccess) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to map buffer. %s:%d\n", __FILE__, __LINE__);
    return 1;
  }
  /* [Map the buffers to pointers] */

  /* [Initialize the input data] */

  const char *activity_atom_text = "nyistu htoftu hnattu hnamtu";
  const uint16_t activity_atom_text_size =
      (uint16_t)(strlen(activity_atom_text));
  const char *quiz_independentClause_list_text =
      "zrundoka hwindocayu hwindokali"
      "hwindoka tyutdocayu tyindokali"
      "tyutdoka tyutdocayu hfutdokali"
      "tyindoka fwandocayu nyatdokali";
  //"bu.hnac.2.hnac.buka bu.hnac.2.hnac.buca yu "
  //"bu.hnac.4.hnac.bukali";
  const uint16_t quiz_independentClause_list_text_size =
      (uint16_t)strlen(quiz_independentClause_list_text);
  uint16_t quiz_independentClause_list_size = 4;
  v16us quiz_independentClause_list[8];
  uint16_t text_remainder = 0;
  // uint16_t program_worth = 0;
  uint64_t random_seed = 0x0123456789ABCDEF;
  uint16_t tablet_indexFinger = 0;
  // uint8_t champion = 0;
  // uint16_t champion_worth = 0;
  // v16us program_;
  // v16us population[4];
  memset(quiz_independentClause_list, 0,
         (size_t)(quiz_independentClause_list_size * TABLET_LONG * WORD_THICK));
  text_code(activity_atom_text_size, activity_atom_text, &activity_atom_size,
            activity_atom, &text_remainder);
  assert(text_remainder == 0);
  text_code(quiz_independentClause_list_text_size,
            quiz_independentClause_list_text, &quiz_independentClause_list_size,
            quiz_independentClause_list, &text_remainder);
  /* [Initialize the input data] */

  /* [Un-map the buffers] */
  /*
   * Unmap the memory objects as we have finished using them from the CPU side.
   * We unmap the memory because otherwise:
   * - reads and writes to that memory from inside a kernel on the OpenCL side
   * are undefined.
   * - the OpenCL implementation cannot free the memory when it is finished.
   */
  if (!success_verification(
          clEnqueueUnmapMemObject(command_waiting_line, memoryObjects[0],
                                  activity_atom, 0, NULL, NULL))) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  // if (!success_verification(clEnqueueUnmapMemObject(command_waiting_line,
  // memoryObjects[1],
  //                                          inputB, 0, NULL, NULL))) {
  //  cleanUpOpenCL(context, command_waiting_line, program, kernel,
  //  memoryObjects,
  //                numberOfMemoryObjects);
  //  cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__
  //       << endl;
  //  return 1;
  //}
  /* [Un-map the buffers] */

  /* [Set the kernel arguments] */
  int setKernelArgumentsSuccess = TRUE;
  printf("arg0\n");
  setKernelArgumentsSuccess &= success_verification(clSetKernelArg(
      kernel, 0, sizeof(uint8_t), (uint8_t *)&activity_atom_size));
  printf("arg1\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[0]));
  printf("arg2\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 2, sizeof(uint16_t), (uint16_t *)&program_size));
  printf("arg3\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 3, sizeof(uint8_t), (uint8_t *)&population_size));
  printf("arg4\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 4, sizeof(uint64_t), (uint64_t *)&random_seed));
  printf("arg5\n");
  setKernelArgumentsSuccess &=
      success_verification(clSetKernelArg(kernel, 5, sizeof(uint64_t *), NULL));
  printf("arg6\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 6, sizeof(cl_mem), &memoryObjects[1]));
  printf("arg7\n");
  setKernelArgumentsSuccess &=
      success_verification(clSetKernelArg(kernel, 7, sizeof(uint8_t *), NULL));
  printf("arg8\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 8, sizeof(cl_mem), &memoryObjects[2]));

  if (!setKernelArgumentsSuccess) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed setting OpenCL kernel arguments. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  /* [Set the kernel arguments] */

  /* An event to associate with the Kernel. Allows us to retrieve profiling
   * information later. */
  cl_event event = 0;

  /* [Global work size] */
  /*
   * Each instance of our OpenCL kernel operates on a single element of each
   * array so the number of
   * instances needed is the number of elements in the array.
   */
  size_t globalWorksize[1] = {population_size};
  size_t localWorksize[1] = {2};
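  /* The global work size (population_size = 4) must be divisible by the local work
   * size (2); otherwise clEnqueueNDRangeKernel fails with CL_INVALID_WORK_GROUP_SIZE. */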
  /* Enqueue the kernel */
  if (!success_verification(clEnqueueNDRangeKernel(
          command_waiting_line, kernel, 1, NULL, globalWorksize, localWorksize,
          0, NULL, &event))) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed enqueuing the kernel. %s:%d\n", __FILE__, __LINE__);
    return 1;
  }
  /* [Global work size] */

  /* Wait for kernel execution completion. */
  if (!success_verification(clFinish(command_waiting_line))) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed waiting for kernel execution to finish. %s:%d\n",
            __FILE__, __LINE__);
    return 1;
  }

  /* Print the profiling information for the event. */
  // printProfilingInfo(event);
  /* Release the event object. */
  if (!success_verification(clReleaseEvent(event))) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed releasing the event object. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  /* Get a pointer to the output data. */
  printf("clOut\n");
  v16us *output = (v16us *)clEnqueueMapBuffer(
      command_waiting_line, memoryObjects[1], CL_TRUE, CL_MAP_READ, 0,
      population_byte_size, 0, NULL, NULL, &errorNumber);
  v16us *newspaper = (v16us *)clEnqueueMapBuffer(
      command_waiting_line, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0,
      newspaper_byte_size, 0, NULL, NULL, &errorNumber);
  if (!success_verification(errorNumber)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to map buffer. %s:%d\n", __FILE__, __LINE__);
    return 1;
  }

  /* [Output the results] */
  /* Uncomment the following block to print results. */
  for (tablet_indexFinger = 0;
       tablet_indexFinger < (population_size * TABLET_LONG);
       ++tablet_indexFinger) {
    if (tablet_indexFinger % 0x10 == 0)
      printf("\n");
    printf("%04X ", (uint)((uint16_t *)output)[tablet_indexFinger]);
  }
  printf("\n");
  // printf("program %04X \n", (uint)*((uint16_t *)&(output[1])));

  printf("newspaper \n");
  for (tablet_indexFinger = 0;
       tablet_indexFinger < (NEWSPAPER_LONG * TABLET_LONG);
       ++tablet_indexFinger) {
    if (tablet_indexFinger % 0x10 == 0)
      printf("\n");
    printf("%04X ", (uint)((uint16_t *)newspaper)[tablet_indexFinger]);
  }
  printf("\n");
  /* [Output the results] */

  /* Unmap the memory object as we are finished using them from the CPU side. */
  if (!success_verification(clEnqueueUnmapMemObject(
          command_waiting_line, memoryObjects[1], output, 0, NULL, NULL))) {
    printf("unmapping\n");
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  if (!success_verification(clEnqueueUnmapMemObject(
          command_waiting_line, memoryObjects[2], newspaper, 0, NULL, NULL))) {
    printf("unmapping\n");
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  printf("releasing\n");
  /* Release OpenCL objects. */
  // cleanUpOpenCL(context, command_waiting_line, program, kernel,
  // memoryObjects,
  //              numberOfMemoryObjects);
}
Esempio n. 15
0
int main(int argc, char** argv) {

  printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE);

	cl_int error;
	cl_uint num_platforms;
	
	// Get the number of platforms
	error = clGetPlatformIDs(0, NULL, &num_platforms);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Get the list of platforms
	cl_platform_id* platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms);
	error = clGetPlatformIDs(num_platforms, platforms, NULL);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Print the chosen platform (if there are multiple platforms, choose the first one)
	cl_platform_id platform = platforms[0];
	char pbuf[100];
	error = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	printf("Platform: %s\n", pbuf);
	
	// Create a GPU context
	cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0};
    context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Get and print the chosen device (if there are multiple devices, choose the first one)
	size_t devices_size;
	error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	cl_device_id *devices = (cl_device_id *) malloc(devices_size);
	error = clGetContextInfo(context, CL_CONTEXT_DEVICES, devices_size, devices, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	device = devices[0];
	error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	printf("Device: %s\n", pbuf);
	
	// Create a command queue
	command_queue = clCreateCommandQueue(context, device, 0, &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	

    int size;
    int grid_rows,grid_cols = 0;
    float *FilesavingTemp,*FilesavingPower; //,*MatrixOut; 
    char *tfile, *pfile, *ofile;
    
    int total_iterations = 60;
    int pyramid_height = 1; // number of iterations
	
	if (argc < 7)
		usage(argc, argv);
	if((grid_rows = atoi(argv[1]))<=0||
	   (grid_cols = atoi(argv[1]))<=0||
       (pyramid_height = atoi(argv[2]))<=0||
       (total_iterations = atoi(argv[3]))<=0)
		usage(argc, argv);
		
	tfile=argv[4];
    pfile=argv[5];
    ofile=argv[6];
	
    size=grid_rows*grid_cols;

    // --------------- pyramid parameters --------------- 
    int borderCols = (pyramid_height)*EXPAND_RATE/2;
    int borderRows = (pyramid_height)*EXPAND_RATE/2;
    int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE;
    int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE;
    int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1);
    int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1);
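    // Each kernel launch advances pyramid_height time steps; every block needs a halo of
    // EXPAND_RATE/2 cells per step, so the usable (small) block shrinks accordingly.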

    FilesavingTemp = (float *) malloc(size*sizeof(float));
    FilesavingPower = (float *) malloc(size*sizeof(float));
    // MatrixOut = (float *) calloc (size, sizeof(float));

    if( !FilesavingPower || !FilesavingTemp) // || !MatrixOut)
        fatal("unable to allocate memory");
	
	// Read input data from disk
    readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
    readinput(FilesavingPower, grid_rows, grid_cols, pfile);
	
	// Load kernel source from file
	const char *source = load_kernel_source("hotspot_kernel.cl");
	size_t sourceSize = strlen(source);
	
	// Compile the kernel
    cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	char clOptions[110];
	//  sprintf(clOptions,"-I../../src"); 
	sprintf(clOptions," ");
#ifdef BLOCK_SIZE
	sprintf(clOptions + strlen(clOptions), " -DBLOCK_SIZE=%d", BLOCK_SIZE);
#endif

    // Create an executable from the kernel
	error = clBuildProgram(program, 1, &device, clOptions, NULL, NULL);
	// Show compiler warnings/errors
	static char log[65536]; memset(log, 0, sizeof(log));
	clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
	if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
    kernel = clCreateKernel(program, "hotspot", &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
		
	long long start_time = get_time();
	
	// Create two temperature matrices and copy the temperature input data
	cl_mem MatrixTemp[2];
	// Create input memory buffers on device
	MatrixTemp[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingTemp, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
    
    // Lingjie Zhang modifited at Nov 1, 2015
    //MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float) * size, NULL, &error);
    MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE , sizeof(float) * size, NULL, &error);
    // end Lingjie Zhang modification
    
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Copy the power input data
	cl_mem MatrixPower = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingPower, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Perform the computation
	int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows, total_iterations, pyramid_height,
								blockCols, blockRows, borderCols, borderRows, FilesavingTemp, FilesavingPower);
	
	// Copy final temperature data back
	cl_float *MatrixOut = (cl_float *) clEnqueueMapBuffer(command_queue, MatrixTemp[ret], CL_TRUE, CL_MAP_READ, 0, sizeof(float) * size, 0, NULL, NULL, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	long long end_time = get_time();	
	printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000));
	
	// Write final output to output file
    writeoutput(MatrixOut, grid_rows, grid_cols, ofile);
    
	error = clEnqueueUnmapMemObject(command_queue, MatrixTemp[ret], (void *) MatrixOut, 0, NULL, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	clReleaseMemObject(MatrixTemp[0]);
	clReleaseMemObject(MatrixTemp[1]);
	clReleaseMemObject(MatrixPower);
	
	clReleaseContext(context);

	return 0;
}
Esempio n. 16
0
int main() {

   /* OpenCL data structures */
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_program program;
   cl_kernel kernel;
   cl_int i, j, err;

   /* Data and buffers */
   float data_one[100], data_two[100], result_array[100];
   cl_mem buffer_one, buffer_two;
   void* mapped_memory;

   /* Initialize arrays */
   for(i=0; i<100; i++) {
      data_one[i] = 1.0f*i;
      data_two[i] = -1.0f*i;
      result_array[i] = 0.0f;
   }

   /* Create a device and context */
   device = create_device();
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }

   /* Build the program and create the kernel */
   program = build_program(context, device, PROGRAM_FILE);
   kernel = clCreateKernel(program, KERNEL_FUNC, &err);
   if(err < 0) {
      perror("Couldn't create a kernel");
      exit(1);   
   };

   /* Create buffers */
   buffer_one = clCreateBuffer(context, CL_MEM_READ_WRITE | 
         CL_MEM_COPY_HOST_PTR, sizeof(data_one), data_one, &err);
   if(err < 0) {
      perror("Couldn't create a buffer object");
      exit(1);   
   }
   buffer_two = clCreateBuffer(context, CL_MEM_READ_WRITE | 
         CL_MEM_COPY_HOST_PTR, sizeof(data_two), data_two, NULL);

   /* Set buffers as arguments to the kernel */
   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer_one);
   err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffer_two);
   if(err < 0) {
      perror("Couldn't set the buffer as the kernel argument");
      exit(1);   
   }

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 0, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Enqueue kernel */
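   /* clEnqueueTask runs the kernel as a single work-item, equivalent to
      clEnqueueNDRangeKernel with a global work size of 1 */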
   err = clEnqueueTask(queue, kernel, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      exit(1);   
   }

   /* Enqueue command to copy buffer one to buffer two */
   err = clEnqueueCopyBuffer(queue, buffer_one, buffer_two, 0, 0,
         sizeof(data_one), 0, NULL, NULL); 
   if(err < 0) {
      perror("Couldn't perform the buffer copy");
      exit(1);   
   }

   /* Enqueue command to map buffer two to host memory */
   mapped_memory = clEnqueueMapBuffer(queue, buffer_two, CL_TRUE,
         CL_MAP_READ, 0, sizeof(data_two), 0, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't map the buffer to host memory");
      exit(1);   
   }

   /* Transfer memory and unmap the buffer */
   memcpy(result_array, mapped_memory, sizeof(data_two));
   err = clEnqueueUnmapMemObject(queue, buffer_two, mapped_memory,
         0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't unmap the buffer");
      exit(1);   
   }

   /* Display updated buffer */
   for(i=0; i<10; i++) {
      for(j=0; j<10; j++) {
         printf("%6.1f", result_array[j+i*10]);
      }
      printf("\n");
   }

   /* Deallocate resources */
   clReleaseMemObject(buffer_one);
   clReleaseMemObject(buffer_two);
   clReleaseKernel(kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);

   return 0;
}
Esempio n. 17
0
float sgemmMain(int rowa,int cola,int colb)
{
	 cl_context context = 0;
	 cl_command_queue commandQueue = 0;
	 cl_program program = 0;
	 cl_device_id device = 0;
	 cl_kernel kernel = 0;
	 const unsigned int numberOfMemoryObjects = 3;
	 cl_mem memoryObjectsa = 0;
	 cl_mem memoryObjectsb = 0;
	 cl_mem memoryObjectsc = 0;
	 cl_int errorNumber;
	 cl_uint clrowa = rowa;
	 cl_uint clcola = cola;
	 cl_uint clcolb = colb;
	 int err;
	 err = createContext(&context);
	 LOGD("create context");
	 err = createCommandQueue(context, &commandQueue, &device);
	 err = createProgram(context, device, "/mnt/sdcard/kernel/sgemm.cl", &program);
	 kernel = clCreateKernel(program, "sgemm", &errorNumber);
	 LOGD("createKernel code %d",errorNumber);
	 LOGD("start computing");
	 float alpha = 1;
	 float beta = 0.1;

	 /* Create the matrices. */
	 size_t matrixSizea = rowa * cola;
	 size_t matrixSizeb = cola * colb;
	 size_t matrixSizec = rowa * colb;

	 /* As all the matrices have the same size, the buffer size is common. */
	 size_t bufferSizea = matrixSizea * sizeof(float);
	 size_t bufferSizeb = matrixSizeb * sizeof(float);
	 size_t bufferSizec = matrixSizec * sizeof(float);

	 /* Create buffers for the matrices used in the kernel. */
	 int createMemoryObjectsSuccess = 0;
	 memoryObjectsa = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizea, NULL, &errorNumber);
	 createMemoryObjectsSuccess &= errorNumber;
	 memoryObjectsb = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizeb, NULL, &errorNumber);
	 createMemoryObjectsSuccess &= errorNumber;
	 memoryObjectsc = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bufferSizec, NULL, &errorNumber);
	 createMemoryObjectsSuccess &= errorNumber;
	 LOGD("create memory err %d",createMemoryObjectsSuccess);
	 int mapMemoryObjectsSuccess = 0;
	 cl_float* matrixA = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsa, CL_TRUE, CL_MAP_WRITE, 0, bufferSizea, 0, NULL, NULL, &errorNumber);
	 mapMemoryObjectsSuccess &= errorNumber;
	 cl_float* matrixB = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsb, CL_TRUE, CL_MAP_WRITE, 0, bufferSizeb, 0, NULL, NULL, &errorNumber);
	 mapMemoryObjectsSuccess &= errorNumber;
	 cl_float* matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_WRITE, 0, bufferSizec, 0, NULL, NULL, &errorNumber);
	 mapMemoryObjectsSuccess &= errorNumber;
	 LOGD("map memory err %d",mapMemoryObjectsSuccess);

	 sgemmInitialize(rowa,cola,colb, matrixA, matrixB, matrixC);
	 LOGD("data initial finish");
	 int unmapMemoryObjectsSuccess = 0;
	 errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsa, matrixA, 0, NULL, NULL);
	 LOGD("memory code %d",errorNumber);
	 unmapMemoryObjectsSuccess &= errorNumber;
	 errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsb, matrixB, 0, NULL, NULL);
	 LOGD("memory code %d",errorNumber);
	 unmapMemoryObjectsSuccess &= errorNumber;
	 errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL);
	 LOGD("memory code %d",errorNumber);
	 unmapMemoryObjectsSuccess &= errorNumber;
	 LOGD("unmap memory err %d",unmapMemoryObjectsSuccess);

	 int setKernelArgumentsSuccess = 0;
	 errorNumber = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjectsa);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjectsb);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjectsc);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 3, sizeof(cl_uint), &clrowa);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 4, sizeof(cl_uint), &clcola);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 5, sizeof(cl_uint), &clcolb);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 6, sizeof(cl_float), &alpha);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 7, sizeof(cl_float), &beta);
	 setKernelArgumentsSuccess &= errorNumber;
	 LOGD("setKernel err %d",setKernelArgumentsSuccess);

	 LOGD("start running kernel");
	 clock_t start_t,end_t;
	 float cost_time;
	 start_t = clock();
	 cl_event event = 0;
	 size_t globalWorksize[2] = {rowa, colb};
	 errorNumber = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorksize, NULL, 0, NULL, &event);
	 //LOGD("Enqueue err code %d",errorNumber);
	 errorNumber = clFinish(commandQueue);
	 end_t = clock();
	 cost_time = (float)(end_t-start_t)/CLOCKS_PER_SEC*1000;
	 LOGD("Finish err code %d",errorNumber);
	 float time;
	 time = printProfilingInfo(event);
	 LOGT("using CPU clock: %f ms",cost_time);
	 LOGT("using GPU clock: %f ms",time);
	 clReleaseEvent(event);
	 matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_READ, 0, bufferSizec, 0, NULL, NULL, &errorNumber);
	 LOGD("read out matrixC finish");
	 LOGD("matrixC value C(0,0): %f",matrixC[0]);	/* read the mapped data before unmapping it */
	 clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL);
	 cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjectsa, memoryObjectsb,memoryObjectsc,numberOfMemoryObjects);
	 LOGD("RUNNING finsh");
	 return time;
}
Esempio n. 18
0
inline void vector_sum(const int  arraySize, 
                       const double* inputA, 
                       const double* inputB, 
                             double* output)
{ 
    /* Allocate memory buffers */
    /*
    * Ask the OpenCL implementation to allocate buffers for the data.
    * We ask the OpenCL implemenation to allocate memory rather than 
    * allocating it on the CPU to avoid having to copy the data later.
    * The read/write flags relate to accesses to the memory from within 
    * the kernel.
    */

    bool createMemoryObjectSuccess = true;
    int numberOfMemoryObjects = 3;
    cl_mem memoryObjects[3] = {0, 0, 0};
    int errorNumber = 0;

    int bufferSize = arraySize*sizeof(double);

    memoryObjects[0] = clCreateBuffer(context, 
            CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 
            bufferSize, (void*)inputA, &errorNumber);
    checkErr(errorNumber, "Failed to create buffer, 1.");
    
    memoryObjects[1] = clCreateBuffer(context, 
            CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 
            bufferSize, (void*)inputB, &errorNumber);
    checkErr(errorNumber, "Failed to create buffer, 2.");
    
    memoryObjects[2] = clCreateBuffer(context, 
            CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, 
            bufferSize, output, &errorNumber);
    checkErr(errorNumber, "Failed to create buffer, 3.");

    /* Enqueue commands and kernels */
    /* Enqueue to the command queues the commands that control the sequence 
     * and synchronization of kernel execution, reading and writing of data,
     * and manipulation of memory objects
     */

    /* Execute a kernel function */
    /* Call clSetKernelArg() for each parameter in the kernel */
    bool setKernelArgumentsSuccess = true;
    setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 0, 
                                        sizeof(cl_mem), &memoryObjects[0]));
    setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 1, 
                                        sizeof(cl_mem), &memoryObjects[1]));
    setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 2, 
                                        sizeof(cl_mem), &memoryObjects[2]));
    if (not setKernelArgumentsSuccess) {
        cleanUpOpenCL();
        std::cerr << "Failed setting OpenCL kernel arguments. " << __FILE__ 
                  << ":"<< __LINE__ << std::endl;
        exit(1);
    }

    /* Determine the work-group size and index space for the kernel */
    const size_t globalWorkSize[1] = {arraySize};
    const size_t localWorkSize[1] = { 1 };

    /* Enqueue the kernel for execution in the command queue */
    //for (int j = 0; j < ITER; j++) {
        if (not checkSuccess(clEnqueueNDRangeKernel(commandQueue, kernel, 1, 
                NULL, globalWorkSize, localWorkSize, 0, NULL, NULL))) {
            
            cleanUpOpenCL();
            std::cerr << "Failed enqueuing the kernel. " << __FILE__ << ":" 
                      << __LINE__ <<std::endl;
            exit(1);
        }
    //}

    /* Get a pointer to the output data */
    output = (double*)clEnqueueMapBuffer(commandQueue, 
                    memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, 
                    bufferSize, 0, NULL, NULL, &errorNumber);   /* the mapped region size is in bytes */

    if (not checkSuccess(errorNumber)) {

        cleanUpOpenCL();
        std::cerr << "Failed to map buffer " << __FILE__ << ":"
                  << __LINE__ << std::endl;
        exit(1); 
    }

    /* Wait for kernel execution */
    if (not checkSuccess(clFinish(commandQueue))) {

        cleanUpOpenCL();
        std::cerr << "Failed waiting for kernel execution to finish. "
                  << __FILE__ << ":"<< __LINE__ << std::endl;
        exit(1);
    }


    /* Release the input buffers and unmap the output buffer now that the CPU
       has finished using them */
    if (not checkSuccess(clReleaseMemObject(memoryObjects[0]))) {

        cleanUpOpenCL();
        std::cerr << "Releasing memory object failed " << __FILE__ << ":"
                  << __LINE__ << std::endl;
        exit(1);
    }
    if (not checkSuccess(clReleaseMemObject(memoryObjects[1]))) {

        cleanUpOpenCL();
        std::cerr << "Releasing memory object failed " << __FILE__ << ":"
                  << __LINE__ << std::endl;
        exit(1);
    }
    if (not checkSuccess(clEnqueueUnmapMemObject(commandQueue, 
                    memoryObjects[2], output, 0, NULL, NULL))) {

        cleanUpOpenCL();
        std::cerr << "Unmapping memory objects failed " << __FILE__ << ":"
                  << __LINE__ << std::endl;
        exit(1);
    }
}