Ejemplo n.º 1
0
/* Print the complete OpenCL program for the gpu_prog "prog", whose
 * transformed AST is "tree", to "p".
 *
 * "user" points to the struct opencl_info describing the output.
 * The kernel printer stored in it is first switched to the C output
 * format.  It is then passed through opencl_enable_double_support
 * if any_double_elements holds for "prog" and through gpu_print_types
 * if the opencl_print_kernel_types option is set.
 * If the kernel printer is NULL afterwards, then "p" is freed and
 * the result of isl_printer_free is returned.
 * Otherwise, the host side is printed to "p" inside a single block.
 */
static __isl_give isl_printer *print_opencl(__isl_take isl_printer *p,
	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
	struct gpu_types *types, void *user)
{
	struct opencl_info *info = user;
	isl_printer *kernel_printer;

	/* Prepare the kernel printer before any host code is emitted. */
	kernel_printer = isl_printer_set_output_format(info->kprinter,
							ISL_FORMAT_C);
	if (any_double_elements(prog)) {
		kernel_printer = opencl_enable_double_support(kernel_printer);
	}
	if (info->options->opencl_print_kernel_types) {
		kernel_printer = gpu_print_types(kernel_printer, types, prog);
	}
	info->kprinter = kernel_printer;

	if (!kernel_printer) {
		return isl_printer_free(p);
	}

	/* Host side: everything lives inside one block. */
	p = ppcg_start_block(p);

	p = opencl_print_host_macros(p);

	/* Declarations, OpenCL setup and device allocations. */
	p = gpu_print_local_declarations(p, prog);
	p = opencl_declare_device_arrays(p, prog);
	p = opencl_setup(p, info->input, info);
	p = opencl_allocate_device_arrays(p, prog);

	/* The host code derived from "tree". */
	p = opencl_print_host_code(p, prog, tree, info);

	/* Release in the reverse order of acquisition. */
	p = opencl_release_device_arrays(p, prog);
	p = opencl_release_cl_objects(p, info);

	return ppcg_end_block(p);
}
Ejemplo n.º 2
0
/* Run the OpenCL kernel of a freshly set-up CLEnv once for each of the
 * first "n_blocks" blocks of "matrix_A" and "matrix_B" and copy the
 * kernel output into the corresponding block of "matrix_C".
 * (Presumably the kernel computes C = A * B per block; the kernel source
 * is not visible from here.)
 *
 * Parameters:
 *   matrix_A, matrix_B  input blocks; block idx is matrix_X[idx]
 *   matrix_C            output blocks; each block must already have its
 *                       final size, only its elements are overwritten
 *   n_blocks            number of leading blocks to process; nothing is
 *                       done when it is <= 0
 *
 * The device buffers and the launch geometry are derived from block 0
 * only, so every processed block is expected to have the same size as
 * block 0.  A larger block makes the OpenCL transfer fail, which is
 * reported through opencl_check_error.
 *
 * OpenCL failures are reported through opencl_check_error.
 */
void mat_mul(array_2D matrix_A, array_2D matrix_B, array_2D &matrix_C, int n_blocks )
{
    // Nothing to compute; this also avoids touching block 0 below.
    if( n_blocks <= 0 )
    {
        return;
    }

    const unsigned int size_A = matrix_A[0].size();
    const unsigned int size_B = matrix_B[0].size();
    const unsigned int size_C = matrix_C[0].size();

    // Empty blocks would lead to divisions by zero and zero-sized buffers.
    if( size_A == 0 || size_B == 0 || size_C == 0 )
    {
        std::cerr << "mat_mul: empty matrix block, nothing to compute" << std::endl;
        return;
    }

    // Derive the matrix dimensions from the flat element counts.
    unsigned int w_C;

    if( size_B == size_C )
    {
        if ( size_B == size_A )
        {
            // All three have the same element count: treat them as square.
            w_C = static_cast<unsigned int>(sqrt(size_B));
        }
        else
        {
            w_C = 1;
        }
    }
    else
    {
        w_C = largest_divider(size_B,size_C);
    }

    if( w_C == 0 )
    {
        std::cerr << "mat_mul: could not derive a width for C" << std::endl;
        return;
    }

    const unsigned int w_B = w_C;
    const unsigned int h_B = size_B / w_B;
    const unsigned int w_A = h_B;

    if( w_A == 0 )
    {
        std::cerr << "mat_mul: could not derive a width for A" << std::endl;
        return;
    }

    const unsigned int h_A = size_A / w_A;

    // Byte sizes are kept in size_t so that large blocks do not wrap.
    const size_t mem_size_A = sizeof(double) * size_A;
    const size_t mem_size_B = sizeof(double) * size_B;
    const size_t mem_size_C = sizeof(double) * size_C;

    CLEnv clenv;
    opencl_setup(clenv);

    // OpenCL device memory for the matrices.
    cl_mem d_A = clCreateBuffer(clenv.context, CL_MEM_READ_ONLY, mem_size_A, NULL, &clenv.errcode);
    opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

    cl_mem d_B = clCreateBuffer(clenv.context, CL_MEM_READ_ONLY, mem_size_B, NULL, &clenv.errcode);
    opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

    cl_mem d_C = clCreateBuffer(clenv.context, CL_MEM_WRITE_ONLY, mem_size_C, NULL, &clenv.errcode);
    opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

    // Launch geometry.
    // NOTE(review): the global sizes are not rounded up to a multiple of
    // the 16x16 work-group size; unless the device supports non-uniform
    // work-groups the launch fails with CL_INVALID_WORK_GROUP_SIZE when
    // h_A or w_B is not a multiple of 16.  Left as is because padding
    // the range is only safe if the kernel bounds-checks its indices.
    size_t localWorkSize[2], globalWorkSize[2];

    localWorkSize [0] = 16;
    localWorkSize [1] = 16;
    globalWorkSize[0] = h_A;
    globalWorkSize[1] = w_B;

    // The kernel arguments are the same for every block, so set them once.
    int wA = w_A;
    int wC = w_C;

    clenv.errcode  = clSetKernelArg(clenv.kernel, 0, sizeof(cl_mem), (void *)&d_C);
    clenv.errcode |= clSetKernelArg(clenv.kernel, 1, sizeof(cl_mem), (void *)&d_A);
    clenv.errcode |= clSetKernelArg(clenv.kernel, 2, sizeof(cl_mem), (void *)&d_B);
    clenv.errcode |= clSetKernelArg(clenv.kernel, 3, sizeof(int),    (void *)&wA);
    clenv.errcode |= clSetKernelArg(clenv.kernel, 4, sizeof(int),    (void *)&wC);
    opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

    for( int idx = 0; idx < n_blocks; idx++ )
    {
        std::cout<<idx<<std::endl;

        const size_t n_a = matrix_A[idx].size();
        const size_t n_b = matrix_B[idx].size();
        const size_t n_c = matrix_C[idx].size();

        // Contiguous host staging buffers.  They live on the heap because
        // a block can easily exceed the stack size.
        double *h_aI = new double[n_a];
        double *h_bI = new double[n_b];
        double *h_cI = new double[n_c];

        for( size_t i = 0; i < n_a; i++ ) h_aI[i] = matrix_A[idx][i];
        for( size_t i = 0; i < n_b; i++ ) h_bI[i] = matrix_B[idx][i];

        // Upload the inputs.  The writes are non-blocking; the staging
        // buffers stay alive until the blocking read below has returned
        // (assumes an in-order command queue -- TODO confirm).
        clenv.errcode = clEnqueueWriteBuffer(clenv.command_queue, d_A, CL_FALSE, 0, sizeof(cl_double) * n_a, h_aI, 0, NULL, NULL);
        opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

        clenv.errcode = clEnqueueWriteBuffer(clenv.command_queue, d_B, CL_FALSE, 0, sizeof(cl_double) * n_b, h_bI, 0, NULL, NULL);
        opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

        clenv.errcode = clEnqueueNDRangeKernel(clenv.command_queue, clenv.kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
        opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

        // Blocking read of exactly as many elements as the host buffer
        // holds, so a block smaller than block 0 cannot be overrun.
        clenv.errcode = clEnqueueReadBuffer(clenv.command_queue, d_C, CL_TRUE, 0, sizeof(cl_double) * n_c, h_cI, 0, NULL, NULL);
        opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

        for( size_t i = 0; i < n_c; i++ ) matrix_C[idx][i] = h_cI[i];

        delete[] h_cI;
        delete[] h_bI;
        delete[] h_aI;
    }

    // Release the device buffers before the environment is torn down.
    clReleaseMemObject(d_C);
    clReleaseMemObject(d_B);
    clReleaseMemObject(d_A);

    opencl_unsetup(clenv);
}