/* Print the complete OpenCL code for the gpu_prog "prog" and its
 * transformed AST "tree" to "p".
 *
 * "user" points to the opencl_info that holds the kernel printer
 * and the options.  The kernel printer is first switched to C output
 * format, then receives the double-support preamble if "prog" has any
 * double elements and the type definitions in "types" if the
 * opencl_print_kernel_types option is set.
 *
 * If the kernel printer is no longer valid after that, "p" is freed
 * and the result of isl_printer_free is returned.  Otherwise, the host
 * code is printed to "p" inside a single block and "p" is returned.
 */
static __isl_give isl_printer *print_opencl(__isl_take isl_printer *p,
	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
	struct gpu_types *types, void *user)
{
	struct opencl_info *info = user;

	/* Kernel side: prepare the printer used for the kernel code. */
	info->kprinter = isl_printer_set_output_format(info->kprinter,
							ISL_FORMAT_C);
	if (any_double_elements(prog))
		info->kprinter =
			opencl_enable_double_support(info->kprinter);
	if (info->options->opencl_print_kernel_types)
		info->kprinter =
			gpu_print_types(info->kprinter, types, prog);

	if (info->kprinter) {
		/* Host side: everything goes in one block on "p". */
		p = ppcg_start_block(p);

		p = opencl_print_host_macros(p);
		p = gpu_print_local_declarations(p, prog);
		p = opencl_declare_device_arrays(p, prog);
		p = opencl_setup(p, info->input, info);
		p = opencl_allocate_device_arrays(p, prog);

		p = opencl_print_host_code(p, prog, tree, info);

		p = opencl_release_device_arrays(p, prog);
		p = opencl_release_cl_objects(p, info);

		p = ppcg_end_block(p);
		return p;
	}

	return isl_printer_free(p);
}
#include <vector>  // host staging buffers in mat_mul; no-op if the file already includes it

/**
 * Run the OpenCL kernel held in a freshly set-up CLEnv once per block and
 * store each result block in matrix_C.
 *
 * For every idx in [0, n_blocks) the flattened blocks matrix_A[idx] and
 * matrix_B[idx] are uploaded to the device, the kernel is launched, and the
 * output buffer is read back into matrix_C[idx].
 *
 * Device buffers and the matrix layout are sized from block 0 only, so all
 * blocks of one matrix are expected to have the same number of elements.
 * A block larger than block 0 makes the corresponding transfer fail, which
 * is reported through opencl_check_error.
 *
 * @param matrix_A  left-hand input blocks (taken by value, per the existing
 *                  interface); needs at least n_blocks blocks
 * @param matrix_B  right-hand input blocks; needs at least n_blocks blocks
 * @param matrix_C  output blocks; every matrix_C[idx] must already have its
 *                  final size, its contents are overwritten
 * @param n_blocks  number of blocks to process; values <= 0 are a no-op
 *
 * NOTE(review): errors are routed through opencl_check_error; this assumes
 * that helper stops execution on failure -- confirm against its definition.
 */
void mat_mul(array_2D matrix_A, array_2D matrix_B, array_2D &matrix_C, int n_blocks )
{
    // Nothing to do, and block 0 must not be touched on empty input.
    if (n_blocks <= 0)
        return;

    // Element counts of the first block of each matrix.
    unsigned int size_A = matrix_A[0].size();
    unsigned int size_B = matrix_B[0].size();
    unsigned int size_C = matrix_C[0].size();

    if (size_A == 0 || size_B == 0 || size_C == 0) {
        std::cerr << "mat_mul: empty matrix block, nothing computed" << std::endl;
        return;
    }

    // Derive the row/column layout from the flattened element counts.
    unsigned int w_C;
    if (size_B != size_C) {
        w_C = largest_divider(size_B, size_C);
    } else if (size_B == size_A) {
        w_C = static_cast<unsigned int>(sqrt(size_B));
    } else {
        w_C = 1;
    }

    // Guard the divisions below against a degenerate layout.
    if (w_C == 0 || size_B / w_C == 0) {
        std::cerr << "mat_mul: cannot derive matrix dimensions" << std::endl;
        return;
    }

    unsigned int w_B = w_C;
    unsigned int h_B = size_B / w_B;
    unsigned int w_A = h_B;
    unsigned int h_A = size_A / w_A;

    // Byte sizes are kept in size_t so large blocks are not truncated.
    const size_t mem_size_A = sizeof(double) * static_cast<size_t>(size_A);
    const size_t mem_size_B = sizeof(double) * static_cast<size_t>(size_B);
    const size_t mem_size_C = sizeof(double) * static_cast<size_t>(size_C);

    CLEnv clenv;
    opencl_setup(clenv);

    // Allocate device memory for the two inputs and the output.
    cl_mem d_A = clCreateBuffer(clenv.context, CL_MEM_READ_ONLY, mem_size_A, NULL, &clenv.errcode);
    opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );
    cl_mem d_B = clCreateBuffer(clenv.context, CL_MEM_READ_ONLY, mem_size_B, NULL, &clenv.errcode);
    opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );
    cl_mem d_C = clCreateBuffer(clenv.context, CL_MEM_WRITE_ONLY, mem_size_C, NULL, &clenv.errcode);
    opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

    // Launch geometry: 16x16 work-groups over an h_A x w_B global range.
    size_t localWorkSize[2], globalWorkSize[2];
    localWorkSize [0] = 16;
    localWorkSize [1] = 16;
    globalWorkSize[0] = h_A;
    globalWorkSize[1] = w_B;

    // The kernel arguments are identical for every block, so bind them once:
    // 0 = output buffer, 1/2 = input buffers, 3 = width of A, 4 = width of C.
    int wA = w_A;
    int wC = w_C;
    clenv.errcode  = clSetKernelArg(clenv.kernel, 0, sizeof(cl_mem), &d_C);
    clenv.errcode |= clSetKernelArg(clenv.kernel, 1, sizeof(cl_mem), &d_A);
    clenv.errcode |= clSetKernelArg(clenv.kernel, 2, sizeof(cl_mem), &d_B);
    clenv.errcode |= clSetKernelArg(clenv.kernel, 3, sizeof(int), &wA);
    clenv.errcode |= clSetKernelArg(clenv.kernel, 4, sizeof(int), &wC);
    opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

    // Heap-backed staging buffers, reused across blocks (no stack-sized arrays).
    std::vector<double> h_a;
    std::vector<double> h_b;
    std::vector<double> h_c;

    for( int idx = 0; idx < n_blocks; idx++ )
    {
        std::cout<<idx<<std::endl;

        const size_t n_a = matrix_A[idx].size();
        const size_t n_b = matrix_B[idx].size();
        const size_t n_c = matrix_C[idx].size();

        // Stage the inputs as contiguous doubles.
        h_a.resize(n_a);
        for( size_t i = 0; i < n_a; i++ )
            h_a[i] = matrix_A[idx][i];

        h_b.resize(n_b);
        for( size_t i = 0; i < n_b; i++ )
            h_b[i] = matrix_B[idx][i];

        h_c.resize(n_c);

        // Upload the inputs. The writes are non-blocking; the staging buffers
        // stay untouched until the blocking read below has returned.
        // NOTE(review): relies on clenv.command_queue being in-order -- confirm.
        clenv.errcode = clEnqueueWriteBuffer(clenv.command_queue, d_A, CL_FALSE, 0, sizeof(double) * n_a, h_a.data(), 0, NULL, NULL);
        opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );
        clenv.errcode = clEnqueueWriteBuffer(clenv.command_queue, d_B, CL_FALSE, 0, sizeof(double) * n_b, h_b.data(), 0, NULL, NULL);
        opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

        // Launch the kernel over the 2-D range.
        clenv.errcode = clEnqueueNDRangeKernel(clenv.command_queue, clenv.kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
        opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

        // Blocking read of exactly as many bytes as the host block holds, so
        // a block smaller than block 0 cannot overrun the staging buffer.
        clenv.errcode = clEnqueueReadBuffer(clenv.command_queue, d_C, CL_TRUE, 0, sizeof(double) * n_c, h_c.data(), 0, NULL, NULL);
        opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

        for( size_t i = 0; i < n_c; i++ )
            matrix_C[idx][i] = h_c[i];
    }

    // Release the device buffers before tearing the environment down.
    clenv.errcode  = clReleaseMemObject(d_A);
    clenv.errcode |= clReleaseMemObject(d_B);
    clenv.errcode |= clReleaseMemObject(d_C);
    opencl_check_error(clenv.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

    opencl_unsetup(clenv);
}