/* Print a statement for copying an array to or from the device, * or for initializing or clearing the device. * The statement identifier of a copying node is called * "to_device_<array name>" or "from_device_<array name>" and * its user pointer points to the gpu_array_info of the array * that needs to be copied. * The node for initializing the device is called "init_device". * The node for clearing the device is called "clear_device". * * Extract the array (if any) from the identifier and call * init_device, clear_device, copy_array_to_device or copy_array_from_device. */ static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p, __isl_keep isl_ast_node *node, struct gpu_prog *prog) { isl_ast_expr *expr, *arg; isl_id *id; const char *name; struct gpu_array_info *array; expr = isl_ast_node_user_get_expr(node); arg = isl_ast_expr_get_op_arg(expr, 0); id = isl_ast_expr_get_id(arg); name = isl_id_get_name(id); array = isl_id_get_user(id); isl_id_free(id); isl_ast_expr_free(arg); isl_ast_expr_free(expr); if (!name) return isl_printer_free(p); if (!strcmp(name, "init_device")) return init_device(p, prog); if (!strcmp(name, "clear_device")) return clear_device(p, prog); if (!array) return isl_printer_free(p); if (!prefixcmp(name, "to_device")) return copy_array_to_device(p, array); else return copy_array_from_device(p, array); }
/* This function is called for each user statement in the AST, * i.e., for each kernel body statement, copy statement or sync statement. */ static __isl_give isl_printer *opencl_print_kernel_stmt( __isl_take isl_printer *p, __isl_take isl_ast_print_options *print_options, __isl_keep isl_ast_node *node, void *user) { isl_id *id; struct ppcg_kernel_stmt *stmt; id = isl_ast_node_get_annotation(node); stmt = isl_id_get_user(id); isl_id_free(id); isl_ast_print_options_free(print_options); switch (stmt->type) { case ppcg_kernel_copy: return ppcg_kernel_print_copy(p, stmt); case ppcg_kernel_sync: return opencl_print_sync(p, stmt); case ppcg_kernel_domain: return print_opencl_kernel_domain(p, stmt); } return p; }
/* Print a list of iterators of type "type" with names "ids" to "p". * Each iterator is assigned the corresponding opencl identifier returned * by the function "opencl_id". * Unlike the equivalent function in the CUDA backend which prints iterators * in reverse order to promote coalescing, this function does not print * iterators in reverse order. The OpenCL backend currently does not take * into account any coalescing considerations. */ static __isl_give isl_printer *print_iterators(__isl_take isl_printer *p, const char *type, __isl_keep isl_id_list *ids, const char *opencl_id) { int i, n; n = isl_id_list_n_id(ids); if (n <= 0) return p; p = isl_printer_start_line(p); p = isl_printer_print_str(p, type); p = isl_printer_print_str(p, " "); for (i = 0; i < n; ++i) { isl_id *id; if (i) p = isl_printer_print_str(p, ", "); id = isl_id_list_get_id(ids, i); p = isl_printer_print_id(p, id); isl_id_free(id); p = isl_printer_print_str(p, " = "); p = isl_printer_print_str(p, opencl_id); p = isl_printer_print_str(p, "("); p = isl_printer_print_int(p, i); p = isl_printer_print_str(p, ")"); } p = isl_printer_print_str(p, ";"); p = isl_printer_end_line(p); return p; }
CloogInput *Cloog::buildCloogInput() { // XXX: We do not copy the context of the scop, but use an unconstrained // context. This 'hack' is necessary as the context may contain bounds // on parameters such as [n] -> {:0 <= n < 2^32}. Those large // integers will cause CLooG to construct a clast that contains // expressions that include these large integers. Such expressions can // possibly not be evaluated correctly with i64 types. The cloog // based code generation backend, however, can not derive types // automatically and just assumes i64 types. Hence, it will break or // generate incorrect code. // This hack does not remove all possibilities of incorrectly generated // code, but it is ensures that for most problems the problems do not // show up. The correct solution, will be to automatically derive the // minimal types for each expression. This could be added to CLooG and it // will be available in the isl based code generation. isl_set *EmptyContext = isl_set_universe(S->getParamSpace()); CloogDomain *Context = cloog_domain_from_isl_set(EmptyContext); CloogUnionDomain *Statements = buildCloogUnionDomain(); isl_set *ScopContext = S->getContext(); for (unsigned i = 0; i < isl_set_dim(ScopContext, isl_dim_param); i++) { isl_id *id = isl_set_get_dim_id(ScopContext, isl_dim_param, i); Statements = cloog_union_domain_set_name(Statements, CLOOG_PARAM, i, isl_id_get_name(id)); isl_id_free(id); } isl_set_free(ScopContext); CloogInput *Input = cloog_input_alloc(Context, Statements); return Input; }
expression_ptr cpp_from_isl::process_expr(isl_ast_expr * ast_expr) { expression_ptr expr; auto type = isl_ast_expr_get_type(ast_expr); switch(type) { case isl_ast_expr_op: { expr = process_op(ast_expr); break; } case isl_ast_expr_id: { auto id = isl_ast_expr_get_id(ast_expr); string name(isl_id_get_name(id)); isl_id_free(id); if (m_id_func) expr = m_id_func(name); if (!expr) expr = make_shared<id_expression>(name); break; } case isl_ast_expr_int: { auto val = isl_ast_expr_get_val(ast_expr); if (isl_val_is_int(val) != isl_bool_true) throw error("Value is not an integer."); int ival = isl_val_get_num_si(val); isl_val_free(val); expr = literal(ival); break; } default: throw error("Unexpected AST expression type."); } return expr; }
/* Print a list of iterators of type "type" with names "ids" to "out". * Each iterator is assigned one of the cuda identifiers in cuda_dims. * In particular, the last iterator is assigned the x identifier * (the first in the list of cuda identifiers). */ static void print_iterators(FILE *out, const char *type, __isl_keep isl_id_list *ids, const char *cuda_dims[]) { int i, n; n = isl_id_list_n_id(ids); if (n <= 0) return; print_indent(out, 4); fprintf(out, "%s ", type); for (i = 0; i < n; ++i) { isl_id *id; if (i) fprintf(out, ", "); id = isl_id_list_get_id(ids, i); fprintf(out, "%s = %s", isl_id_get_name(id), cuda_dims[n - 1 - i]); isl_id_free(id); } fprintf(out, ";\n"); }
/* Print the user statement of the host code to "p". * * The host code may contain original user statements, kernel launches, * statements that copy data to/from the device and statements * the initialize or clear the device. * The original user statements and the kernel launches have * an associated annotation, while the other statements do not. * The latter are handled by print_device_node. * The annotation on the user statements is called "user". * * In case of a kernel launch, print a block of statements that * defines the grid and the block and then launches the kernel. */ static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p, __isl_take isl_ast_print_options *print_options, __isl_keep isl_ast_node *node, void *user) { isl_id *id; int is_user; struct ppcg_kernel *kernel; struct ppcg_kernel_stmt *stmt; struct print_host_user_data *data; isl_ast_print_options_free(print_options); data = (struct print_host_user_data *) user; id = isl_ast_node_get_annotation(node); if (!id) { //p = isl_printer_print_str(p,"marker_NO_ID_CASE"); return print_device_node(p, node, data->prog); } is_user = !strcmp(isl_id_get_name(id), "user"); kernel = is_user ? NULL : isl_id_get_user(id); stmt = is_user ? isl_id_get_user(id) : NULL; isl_id_free(id); if (is_user) return ppcg_kernel_print_domain(p, stmt); p = ppcg_start_block(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "dim3 k"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, "_dimBlock"); print_reverse_list(isl_printer_get_file(p), kernel->n_block, kernel->block_dim); p = isl_printer_print_str(p, ";"); p = isl_printer_end_line(p); p = print_grid(p, kernel); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "kernel"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, " <<<k"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, "_dimGrid, k"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, "_dimBlock>>> ("); p = print_kernel_arguments(p, data->prog, kernel, 0); p = isl_printer_print_str(p, ");"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "cudaCheckKernel();"); p = isl_printer_end_line(p); p = ppcg_end_block(p); p = isl_printer_start_line(p); p = isl_printer_end_line(p); p = copy_data_from_device_to_device(p,kernel); printf("printing kernel"); print_kernel(data->prog, kernel, data->cuda); printf("printing kernel done"); return p; }
/* Print the user statement of the host code to "p". * * The host code may contain original user statements, kernel launches and * statements that copy data to/from the device. * The original user statements and the kernel launches have * an associated annotation, while the data copy statements do not. * The latter are handled by print_to_from_device. * The annotation on the user statements is called "user". * * In case of a kernel launch, print a block of statements that * defines the grid and the work group and then launches the kernel. * * A grid is composed of many work groups (blocks), each work group holds * many work-items (threads). * * global_work_size[kernel->n_block] represents the total number of work * items. It points to an array of kernel->n_block unsigned * values that describe the total number of work-items that will execute * the kernel. The total number of work-items is computed as: * global_work_size[0] *...* global_work_size[kernel->n_block - 1]. * * The size of each work group (i.e. the number of work-items in each work * group) is described using block_size[kernel->n_block]. The total * number of work-items in a block (work-group) is computed as: * block_size[0] *... * block_size[kernel->n_block - 1]. * * For more information check: * http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html */ static __isl_give isl_printer *opencl_print_host_user( __isl_take isl_printer *p, __isl_take isl_ast_print_options *print_options, __isl_keep isl_ast_node *node, void *user) { isl_id *id; int is_user; struct ppcg_kernel *kernel; struct ppcg_kernel_stmt *stmt; struct print_host_user_data_opencl *data; isl_ast_print_options_free(print_options); data = (struct print_host_user_data_opencl *) user; id = isl_ast_node_get_annotation(node); if (!id) return print_to_from_device(p, node, data->prog); is_user = !strcmp(isl_id_get_name(id), "user"); kernel = is_user ? NULL : isl_id_get_user(id); stmt = is_user ? isl_id_get_user(id) : NULL; isl_id_free(id); if (is_user) return ppcg_kernel_print_domain(p, stmt); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "{"); p = isl_printer_end_line(p); p = isl_printer_indent(p, 2); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "size_t global_work_size["); if (kernel->n_block > 0) p = isl_printer_print_int(p, kernel->n_block); else p = isl_printer_print_int(p, 1); p = isl_printer_print_str(p, "] = {"); p = opencl_print_total_number_of_work_items_as_list(p, kernel); p = isl_printer_print_str(p, "};"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "size_t block_size["); if (kernel->n_block > 0) p = isl_printer_print_int(p, kernel->n_block); else p = isl_printer_print_int(p, 1); p = isl_printer_print_str(p, "] = {"); p = opencl_print_block_sizes(p, kernel); p = isl_printer_print_str(p, "};"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "cl_kernel kernel"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, " = clCreateKernel(program, \"kernel"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, "\", &err);"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "openclCheckReturn(err);"); p = isl_printer_end_line(p); opencl_set_kernel_arguments(p, data->prog, kernel); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "openclCheckReturn(clEnqueueNDRangeKernel" "(queue, kernel"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, ", "); if (kernel->n_block > 0) p = isl_printer_print_int(p, kernel->n_block); else p = isl_printer_print_int(p, 1); p = isl_printer_print_str(p, ", NULL, global_work_size, " "block_size, " "0, NULL, NULL));"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "openclCheckReturn(" "clReleaseKernel(kernel"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, "));"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "clFinish(queue);"); p = isl_printer_end_line(p); p = isl_printer_indent(p, -2); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "}"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_end_line(p); data->opencl->kprinter = opencl_print_kernel(data->prog, kernel, data->opencl->kprinter); return p; }
isl_ast_node * ast_gen::after_for(isl_ast_node *node, isl_ast_build * builder) { if (verbose<ast_gen>::enabled()) cout << "-- After for" << endl; bool is_deepest_loop = m_deepest_loop == m_current_loop; bool is_requested_parallel = false; { auto iter_expr = isl_ast_node_for_get_iterator(node); auto id = isl_ast_expr_get_id(iter_expr); if (verbose<ast_gen>::enabled()) cout << " Loop iter: " << isl_id_get_name(id) << endl; auto data = isl_id_get_user(id); if (data == &m_parallel_loop_id) { if (verbose<ast_gen>::enabled()) cout << " Requested as parallel" << endl; is_requested_parallel = true; } id = isl_id_free(id); isl_ast_expr_free(iter_expr); } auto id = isl_ast_node_get_annotation(node); auto info = ast_node_info::get_from_id(id); // Mark loop parallel if parallelizable and // either requested by user or outermost parallizable. if (!m_options.parallel) { if (verbose<ast_gen>::enabled()) cout << " Explicit parallelization not enabled." << endl; } else if (!info->is_parallelizable) { if (verbose<ast_gen>::enabled()) cout << " Not parallelizable." << endl; } else if (m_options.parallel_dim < 0) { if (m_num_parallelizable_loops != 1) { if (verbose<ast_gen>::enabled()) cout << " Not the outermost parallelizable loop." << endl; } else { info->is_parallel = true; } } else { if (!is_requested_parallel) { if (verbose<ast_gen>::enabled()) cout << " Not the requested parallel loop." << endl; } else { if (verbose<ast_gen>::enabled()) cout << " Parallelized." << endl; info->is_parallel = true; } } // Mark loop vectorized if parallelizable and deepest. if (m_options.vectorize && is_deepest_loop && info->is_parallelizable) { if (verbose<ast_gen>::enabled()) cout << "-- Loop vectorized." << endl; info->is_vector = true; } if (info->is_parallel || info->is_vector) { store_parallel_accesses_for_current_dimension(builder); } if (info->is_parallelizable) --m_num_parallelizable_loops; --m_current_loop; id = isl_id_free(id); return node; }