CloogInput *Cloog::buildCloogInput() { // XXX: We do not copy the context of the scop, but use an unconstrained // context. This 'hack' is necessary as the context may contain bounds // on parameters such as [n] -> {:0 <= n < 2^32}. Those large // integers will cause CLooG to construct a clast that contains // expressions that include these large integers. Such expressions can // possibly not be evaluated correctly with i64 types. The cloog // based code generation backend, however, can not derive types // automatically and just assumes i64 types. Hence, it will break or // generate incorrect code. // This hack does not remove all possibilities of incorrectly generated // code, but it is ensures that for most problems the problems do not // show up. The correct solution, will be to automatically derive the // minimal types for each expression. This could be added to CLooG and it // will be available in the isl based code generation. isl_set *EmptyContext = isl_set_universe(S->getParamSpace()); CloogDomain *Context = cloog_domain_from_isl_set(EmptyContext); CloogUnionDomain *Statements = buildCloogUnionDomain(); isl_set *ScopContext = S->getContext(); for (unsigned i = 0; i < isl_set_dim(ScopContext, isl_dim_param); i++) { isl_id *id = isl_set_get_dim_id(ScopContext, isl_dim_param, i); Statements = cloog_union_domain_set_name(Statements, CLOOG_PARAM, i, isl_id_get_name(id)); isl_id_free(id); } isl_set_free(ScopContext); CloogInput *Input = cloog_input_alloc(Context, Statements); return Input; }
/* Print a statement for copying an array to or from the device, * or for initializing or clearing the device. * The statement identifier of a copying node is called * "to_device_<array name>" or "from_device_<array name>" and * its user pointer points to the gpu_array_info of the array * that needs to be copied. * The node for initializing the device is called "init_device". * The node for clearing the device is called "clear_device". * * Extract the array (if any) from the identifier and call * init_device, clear_device, copy_array_to_device or copy_array_from_device. */ static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p, __isl_keep isl_ast_node *node, struct gpu_prog *prog) { isl_ast_expr *expr, *arg; isl_id *id; const char *name; struct gpu_array_info *array; expr = isl_ast_node_user_get_expr(node); arg = isl_ast_expr_get_op_arg(expr, 0); id = isl_ast_expr_get_id(arg); name = isl_id_get_name(id); array = isl_id_get_user(id); isl_id_free(id); isl_ast_expr_free(arg); isl_ast_expr_free(expr); if (!name) return isl_printer_free(p); if (!strcmp(name, "init_device")) return init_device(p, prog); if (!strcmp(name, "clear_device")) return clear_device(p, prog); if (!array) return isl_printer_free(p); if (!prefixcmp(name, "to_device")) return copy_array_to_device(p, array); else return copy_array_from_device(p, array); }
expression_ptr cpp_from_isl::process_expr(isl_ast_expr * ast_expr) { expression_ptr expr; auto type = isl_ast_expr_get_type(ast_expr); switch(type) { case isl_ast_expr_op: { expr = process_op(ast_expr); break; } case isl_ast_expr_id: { auto id = isl_ast_expr_get_id(ast_expr); string name(isl_id_get_name(id)); isl_id_free(id); if (m_id_func) expr = m_id_func(name); if (!expr) expr = make_shared<id_expression>(name); break; } case isl_ast_expr_int: { auto val = isl_ast_expr_get_val(ast_expr); if (isl_val_is_int(val) != isl_bool_true) throw error("Value is not an integer."); int ival = isl_val_get_num_si(val); isl_val_free(val); expr = literal(ival); break; } default: throw error("Unexpected AST expression type."); } return expr; }
/* Print a list of iterators of type "type" with names "ids" to "out". * Each iterator is assigned one of the cuda identifiers in cuda_dims. * In particular, the last iterator is assigned the x identifier * (the first in the list of cuda identifiers). */ static void print_iterators(FILE *out, const char *type, __isl_keep isl_id_list *ids, const char *cuda_dims[]) { int i, n; n = isl_id_list_n_id(ids); if (n <= 0) return; print_indent(out, 4); fprintf(out, "%s ", type); for (i = 0; i < n; ++i) { isl_id *id; if (i) fprintf(out, ", "); id = isl_id_list_get_id(ids, i); fprintf(out, "%s = %s", isl_id_get_name(id), cuda_dims[n - 1 - i]); isl_id_free(id); } fprintf(out, ";\n"); }
instruction_list * isl_user_to_noclock (isl_ast_node * user_node) { isl_ast_expr * expr = isl_ast_node_user_get_expr (user_node); instruction * user = instruction_alloc (); user->type = INSTR_CALL; user->content.call.identifier = strdup (isl_id_get_name ( isl_ast_expr_get_id (isl_ast_expr_get_op_arg (expr, 0)))); for (int i = 1; i < isl_ast_expr_get_op_n_arg (expr); ++i) { expression_list * e = expression_list_alloc (); e->element = isl_expr_to_noclock_expr (isl_ast_expr_get_op_arg (expr, i)); e->next = NULL; user->content.call.arguments = expression_list_cat ( user->content.call.arguments, e); } instruction_list * list = instruction_list_alloc (); list->element = user; list->next = NULL; return list; }
instruction_list * isl_for_to_noclock (isl_ast_node * for_node) { /* Extract the for loop information. */ isl_ast_expr * iterator = isl_ast_node_for_get_iterator (for_node); isl_id * id = isl_ast_expr_get_id (iterator); isl_ast_expr * init = isl_ast_node_for_get_init (for_node); isl_ast_expr * cond = isl_ast_node_for_get_cond (for_node); isl_ast_node * body = isl_ast_node_for_get_body (for_node); /* Construct the for loop. */ instruction * loop = instruction_for_loop ( strdup (isl_id_get_name (id)), isl_init_to_expr (init), isl_cond_to_expr (cond), isl_ast_to_noclock_ast (body)); /* Wrap the loop in an instruction list node. */ instruction_list * list = instruction_list_alloc (); list->element = loop; list->next = NULL; return list; }
/* Print the user statement of the host code to "p". * * The host code may contain original user statements, kernel launches, * statements that copy data to/from the device and statements * the initialize or clear the device. * The original user statements and the kernel launches have * an associated annotation, while the other statements do not. * The latter are handled by print_device_node. * The annotation on the user statements is called "user". * * In case of a kernel launch, print a block of statements that * defines the grid and the block and then launches the kernel. */ static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p, __isl_take isl_ast_print_options *print_options, __isl_keep isl_ast_node *node, void *user) { isl_id *id; int is_user; struct ppcg_kernel *kernel; struct ppcg_kernel_stmt *stmt; struct print_host_user_data *data; isl_ast_print_options_free(print_options); data = (struct print_host_user_data *) user; id = isl_ast_node_get_annotation(node); if (!id) { //p = isl_printer_print_str(p,"marker_NO_ID_CASE"); return print_device_node(p, node, data->prog); } is_user = !strcmp(isl_id_get_name(id), "user"); kernel = is_user ? NULL : isl_id_get_user(id); stmt = is_user ? isl_id_get_user(id) : NULL; isl_id_free(id); if (is_user) return ppcg_kernel_print_domain(p, stmt); p = ppcg_start_block(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "dim3 k"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, "_dimBlock"); print_reverse_list(isl_printer_get_file(p), kernel->n_block, kernel->block_dim); p = isl_printer_print_str(p, ";"); p = isl_printer_end_line(p); p = print_grid(p, kernel); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "kernel"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, " <<<k"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, "_dimGrid, k"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, "_dimBlock>>> ("); p = print_kernel_arguments(p, data->prog, kernel, 0); p = isl_printer_print_str(p, ");"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "cudaCheckKernel();"); p = isl_printer_end_line(p); p = ppcg_end_block(p); p = isl_printer_start_line(p); p = isl_printer_end_line(p); p = copy_data_from_device_to_device(p,kernel); printf("printing kernel"); print_kernel(data->prog, kernel, data->cuda); printf("printing kernel done"); return p; }
/* Print the user statement of the host code to "p". * * The host code may contain original user statements, kernel launches and * statements that copy data to/from the device. * The original user statements and the kernel launches have * an associated annotation, while the data copy statements do not. * The latter are handled by print_to_from_device. * The annotation on the user statements is called "user". * * In case of a kernel launch, print a block of statements that * defines the grid and the work group and then launches the kernel. * * A grid is composed of many work groups (blocks), each work group holds * many work-items (threads). * * global_work_size[kernel->n_block] represents the total number of work * items. It points to an array of kernel->n_block unsigned * values that describe the total number of work-items that will execute * the kernel. The total number of work-items is computed as: * global_work_size[0] *...* global_work_size[kernel->n_block - 1]. * * The size of each work group (i.e. the number of work-items in each work * group) is described using block_size[kernel->n_block]. The total * number of work-items in a block (work-group) is computed as: * block_size[0] *... * block_size[kernel->n_block - 1]. * * For more information check: * http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html */ static __isl_give isl_printer *opencl_print_host_user( __isl_take isl_printer *p, __isl_take isl_ast_print_options *print_options, __isl_keep isl_ast_node *node, void *user) { isl_id *id; int is_user; struct ppcg_kernel *kernel; struct ppcg_kernel_stmt *stmt; struct print_host_user_data_opencl *data; isl_ast_print_options_free(print_options); data = (struct print_host_user_data_opencl *) user; id = isl_ast_node_get_annotation(node); if (!id) return print_to_from_device(p, node, data->prog); is_user = !strcmp(isl_id_get_name(id), "user"); kernel = is_user ? NULL : isl_id_get_user(id); stmt = is_user ? isl_id_get_user(id) : NULL; isl_id_free(id); if (is_user) return ppcg_kernel_print_domain(p, stmt); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "{"); p = isl_printer_end_line(p); p = isl_printer_indent(p, 2); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "size_t global_work_size["); if (kernel->n_block > 0) p = isl_printer_print_int(p, kernel->n_block); else p = isl_printer_print_int(p, 1); p = isl_printer_print_str(p, "] = {"); p = opencl_print_total_number_of_work_items_as_list(p, kernel); p = isl_printer_print_str(p, "};"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "size_t block_size["); if (kernel->n_block > 0) p = isl_printer_print_int(p, kernel->n_block); else p = isl_printer_print_int(p, 1); p = isl_printer_print_str(p, "] = {"); p = opencl_print_block_sizes(p, kernel); p = isl_printer_print_str(p, "};"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "cl_kernel kernel"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, " = clCreateKernel(program, \"kernel"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, "\", &err);"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "openclCheckReturn(err);"); p = isl_printer_end_line(p); opencl_set_kernel_arguments(p, data->prog, kernel); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "openclCheckReturn(clEnqueueNDRangeKernel" "(queue, kernel"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, ", "); if (kernel->n_block > 0) p = isl_printer_print_int(p, kernel->n_block); else p = isl_printer_print_int(p, 1); p = isl_printer_print_str(p, ", NULL, global_work_size, " "block_size, " "0, NULL, NULL));"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "openclCheckReturn(" "clReleaseKernel(kernel"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, "));"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "clFinish(queue);"); p = isl_printer_end_line(p); p = isl_printer_indent(p, -2); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "}"); p = isl_printer_end_line(p); p = isl_printer_start_line(p); p = isl_printer_end_line(p); data->opencl->kprinter = opencl_print_kernel(data->prog, kernel, data->opencl->kprinter); return p; }
isl_ast_node * ast_gen::after_for(isl_ast_node *node, isl_ast_build * builder) { if (verbose<ast_gen>::enabled()) cout << "-- After for" << endl; bool is_deepest_loop = m_deepest_loop == m_current_loop; bool is_requested_parallel = false; { auto iter_expr = isl_ast_node_for_get_iterator(node); auto id = isl_ast_expr_get_id(iter_expr); if (verbose<ast_gen>::enabled()) cout << " Loop iter: " << isl_id_get_name(id) << endl; auto data = isl_id_get_user(id); if (data == &m_parallel_loop_id) { if (verbose<ast_gen>::enabled()) cout << " Requested as parallel" << endl; is_requested_parallel = true; } id = isl_id_free(id); isl_ast_expr_free(iter_expr); } auto id = isl_ast_node_get_annotation(node); auto info = ast_node_info::get_from_id(id); // Mark loop parallel if parallelizable and // either requested by user or outermost parallizable. if (!m_options.parallel) { if (verbose<ast_gen>::enabled()) cout << " Explicit parallelization not enabled." << endl; } else if (!info->is_parallelizable) { if (verbose<ast_gen>::enabled()) cout << " Not parallelizable." << endl; } else if (m_options.parallel_dim < 0) { if (m_num_parallelizable_loops != 1) { if (verbose<ast_gen>::enabled()) cout << " Not the outermost parallelizable loop." << endl; } else { info->is_parallel = true; } } else { if (!is_requested_parallel) { if (verbose<ast_gen>::enabled()) cout << " Not the requested parallel loop." << endl; } else { if (verbose<ast_gen>::enabled()) cout << " Parallelized." << endl; info->is_parallel = true; } } // Mark loop vectorized if parallelizable and deepest. if (m_options.vectorize && is_deepest_loop && info->is_parallelizable) { if (verbose<ast_gen>::enabled()) cout << "-- Loop vectorized." << endl; info->is_vector = true; } if (info->is_parallel || info->is_vector) { store_parallel_accesses_for_current_dimension(builder); } if (info->is_parallelizable) --m_num_parallelizable_loops; --m_current_loop; id = isl_id_free(id); return node; }
expression * isl_expr_to_noclock_expr (isl_ast_expr * expr) { expression * e = NULL; enum isl_ast_expr_type expr_t = isl_ast_expr_get_type (expr); if (expr_t == isl_ast_expr_id) { e = expression_from_identifier (isl_id_get_name ( isl_ast_expr_get_id (expr))); return e; } else if (expr_t == isl_ast_expr_int) { e = expression_from_number ( isl_val_get_num_si (isl_ast_expr_get_val (expr))); return e; } bool binary = false; enum isl_ast_op_type t = isl_ast_expr_get_op_type (expr); switch (t) { case isl_ast_op_max: e = expression_alloc (); expression_set_type (e, EXPR_MAX); binary = true; break; case isl_ast_op_min: e = expression_alloc (); expression_set_type (e, EXPR_MIN); binary = true; break; case isl_ast_op_minus: e = expression_alloc (); expression_set_type (e, EXPR_NEG); break; case isl_ast_op_add: e = expression_alloc (); expression_set_type (e, EXPR_ADD); binary = true; break; case isl_ast_op_sub: e = expression_alloc (); expression_set_type (e, EXPR_SUB); binary = true; break; case isl_ast_op_mul: e = expression_alloc (); expression_set_type (e, EXPR_MULT); binary = true; break; case isl_ast_op_div: case isl_ast_op_fdiv_q: case isl_ast_op_pdiv_q: case isl_ast_op_pdiv_r: e = expression_alloc (); expression_set_type (e, EXPR_DIV); binary = true; break; case isl_ast_op_member: case isl_ast_op_cond: case isl_ast_op_select: return e; break; case isl_ast_op_eq: e = expression_alloc (); expression_set_type (e, EXPR_EQ); binary = true; break; case isl_ast_op_le: e = expression_alloc (); expression_set_type (e, EXPR_LE); binary = true; break; case isl_ast_op_lt: e = expression_alloc (); expression_set_type (e, EXPR_LT); binary = true; break; case isl_ast_op_ge: e = expression_alloc (); expression_set_type (e, EXPR_GE); binary = true; break; case isl_ast_op_gt: e = expression_alloc (); expression_set_type (e, EXPR_GT); binary = true; break; case isl_ast_op_and: case isl_ast_op_and_then: e = expression_alloc (); expression_set_type (e, EXPR_AND); binary = true; break; case isl_ast_op_or: case isl_ast_op_or_else: e = expression_alloc (); expression_set_type (e, EXPR_OR); binary = true; break; case isl_ast_op_call: case isl_ast_op_access: case isl_ast_op_address_of: default: return e; break; } if (binary) { expression_set_left_operand (e, isl_expr_to_noclock_expr (isl_ast_expr_get_op_arg (expr, 0))); expression_set_right_operand (e, isl_expr_to_noclock_expr (isl_ast_expr_get_op_arg (expr, 1))); } else { expression_set_left_operand (e, isl_expr_to_noclock_expr (isl_ast_expr_get_op_arg (expr, 0))); } return e; }