d_void * acc_malloc_(size_t device_idx, size_t n) { #if PRINT_INFO printf("[info] acc_malloc_(size_t device_idx = %u, size_t n = %d)\n", device_idx, n); #endif cl_int status; cl_mem buffer = clCreateBuffer( /* cl_context context */ acc_runtime.opencl_data->devices_data[device_idx]->context, /* cl_mem_flags flags */ CL_MEM_READ_WRITE, /* size_t size */ n, /* void *host_ptr */ NULL, /* cl_int *errcode_ret */ &status ); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clCreateBuffer return %s for device %u and size %u.\n", status_str, device_idx, (unsigned)n); exit(-1); /// \todo error code } #if PRINT_INFO printf("[info] return %X\n", buffer); #endif return (d_void *)buffer; }
void acc_free_(size_t device_idx, d_void * dev_ptr) { cl_int status = clReleaseMemObject((cl_mem)dev_ptr); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clReleaseMemObject return %s for device ptr = %x.\n", status_str, dev_ptr); exit(-1); /// \todo error code } }
void acc_memcpy_from_device_(size_t device_idx, h_void * dest, d_void * src, size_t bytes) { cl_event event; cl_int status = clEnqueueReadBuffer ( /* cl_command_queue command_queue */ acc_runtime.opencl_data->devices_data[device_idx]->command_queue, /* cl_mem buffer */ (cl_mem)src, /* cl_bool blocking_read */ CL_FALSE, /* size_t offset */ 0, /* size_t cb */ bytes, /* void *ptr */ dest, /* cl_uint num_events_in_wait_list */ 0, /* const cl_event *event_wait_list */ NULL, /* cl_event *event */ &event ); if (status != CL_SUCCESS) { char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clEnqueueReadBuffer return %s for device ptr = %X to host ptr = %X of size %u.\n", status_str, (unsigned int)src, (unsigned int)dest, (unsigned int)bytes); exit(-1); /// \todo error code } acc_profiling_register_memcpy_from_device(event, device_idx, dest, src, bytes); }
void acc_enqueue_kernel(acc_region_t region, acc_kernel_t kernel) { #if PRINT_INFO printf("[info] acc_enqueue_kernel\n"); #endif unsigned dev_idx; for (dev_idx = 0; dev_idx < region->num_devices; dev_idx++) { assert(region->devices[dev_idx].num_gang > 0); assert(region->devices[dev_idx].num_worker > 0); assert(region->devices[dev_idx].vector_length > 0); size_t device_idx = region->devices[dev_idx].device_idx; assert(acc_runtime.opencl_data->devices_data[device_idx] != NULL); // Create a default context acc_context_t context = acc_create_context(region, kernel, device_idx); // If nothing have to be done on this device the context is NULL. if (context == NULL) continue; // Look for a matching version of the kernel, fill the context according to the selected version cl_kernel ocl_kernel = acc_build_ocl_kernel(region, kernel, context, device_idx); cl_int status; cl_uint idx = 0; unsigned i, j, k, l; // Set params kernel arguments for (i = 0; i < kernel->desc->num_params; i++) { status = clSetKernelArg(ocl_kernel, idx, kernel->desc->size_params[i], kernel->param_ptrs[i]); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clSetKernelArg return %s for region[%u].kernel[%u] argument %u (scalar #%u).\n", status_str, region->desc->id, kernel->desc->id, idx, i ); exit(-1); /// \todo error code } idx++; } // Set scalar kernel arguments for (i = 0; i < kernel->desc->num_scalars; i++) { status = clSetKernelArg(ocl_kernel, idx, kernel->desc->size_scalars[i], kernel->scalar_ptrs[i]); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clSetKernelArg return %s for region[%u].kernel[%u] argument %u (scalar #%u).\n", status_str, region->desc->id, kernel->desc->id, idx, i ); exit(-1); /// \todo error code } idx++; } // Set data kernel argument for (i = 0; i < kernel->desc->num_datas; i++) { assert(kernel->data_ptrs[i] != NULL); h_void * h_data_ptr = kernel->data_ptrs[i]; size_t n = kernel->data_size[i]; acc_distributed_data(region, device_idx, &h_data_ptr, &n); d_void * d_data_ptr = acc_deviceptr_(device_idx, h_data_ptr); assert(d_data_ptr != NULL); status = clSetKernelArg(ocl_kernel, idx, sizeof(cl_mem), &(d_data_ptr)); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clSetKernelArg return %s for region[%u].kernel[%u] argument %u (data #%u).\n", status_str, region->desc->id, kernel->desc->id, idx, i ); exit(-1); /// \todo error code } idx++; // if data is distributed need to provide the offset for (j = 0; j < region->desc->num_distributed_data; j++) if (kernel->data_ptrs[i] == region->distributed_data[j].ptr) break; if (j < region->desc->num_distributed_data && region->desc->distributed_data[j].mode != e_all) { #if PRINT_INFO printf("[info] region[%u].kernel[%u] on device #%u data #%u is distributed.\n", region->desc->id, kernel->desc->id, device_idx, i ); #endif assert( region->desc->distributed_data[j].mode == e_contiguous && region->desc->distributed_data[j].nbr_dev == region->num_devices && region->desc->distributed_data[j].portions != NULL ); for (k = 0; k < region->num_devices; k++) if (region->devices[k].device_idx == device_idx) break; assert(k < region->num_devices); unsigned sum_portions = 0; unsigned prev_portion = 0; for (l = 0; l < region->num_devices; l++) { sum_portions += region->desc->distributed_data[j].portions[l]; if (l < k) prev_portion += region->desc->distributed_data[j].portions[l]; }; int offset = (region->distributed_data[j].size * prev_portion) / sum_portions; #if PRINT_INFO printf("[info] sum_portions = %d\n", sum_portions); printf("[info] prev_portion = %d\n", prev_portion); printf("[info] offset = %d\n", offset); #endif status = clSetKernelArg(ocl_kernel, idx, sizeof(int), &offset); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clSetKernelArg return %s for region[%u].kernel[%u] argument %u: offset for distributed data %u.\n", status_str, region->desc->id, kernel->desc->id, idx, i ); exit(-1); /// \todo error code } idx++; } } // Allocate/copy context in constant memory \todo alloc only copy before launch with event wait cl_mem ocl_context = clCreateBuffer( acc_runtime.opencl_data->devices_data[device_idx]->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(struct acc_context_t_) + context->num_loop * sizeof(struct acc_kernel_loop_t_), context, &status ); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clCreateBuffer return %s for region[%u].kernel[%u] when call to build the kernel copy of context.\n", status_str, region->desc->id, kernel->desc->id ); exit(-1); /// \todo error code } // Set context of the kernel status = clSetKernelArg(ocl_kernel, idx, sizeof(cl_mem), &ocl_context); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clSetKernelArg return %s for region[%u].kernel[%u] argument %u (context).\n", status_str, region->desc->id, kernel->desc->id, idx, i ); exit(-1); /// \todo error code } idx++; assert(acc_runtime.opencl_data->devices_data[device_idx]->command_queue != NULL); // Launch the kernel size_t global_work_size[1] = { region->devices[dev_idx].num_gang * region->devices[dev_idx].num_worker }; size_t local_work_size[1] = { region->devices[dev_idx].num_worker }; cl_event event; status = clEnqueueNDRangeKernel( acc_runtime.opencl_data->devices_data[device_idx]->command_queue, ocl_kernel, /* cl_uint work_dim = */ 1, /* const size_t * global_work_offset = */ NULL, /* const size_t * global_work_size = */ global_work_size, /* const size_t * local_work_size = */ local_work_size, /* cl_uint num_events_in_wait_list = */ 0, /* const cl_event * event_wait_list = */ NULL, /* cl_event * event = */ &event ); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clEnqueueNDRangeKernel return %s for region[%u].kernel[%u].\n", status_str, region->desc->id, kernel->desc->id ); exit(-1); /// \todo error code } acc_profiling_register_kernel_launch(event, device_idx, region->desc->id, kernel->desc->id); } }
void acc_enqueue_kernel(acc_region_t region, acc_kernel_t kernel) { #if DBG_KERNEL printf("[debug] acc_enqueue_kernel #%zd\n", kernel->desc->id); #endif size_t i, j, k, l, dev_idx; for (dev_idx = 0; dev_idx < region->desc->num_devices; dev_idx++) { assert(region->devices[dev_idx].num_gang[0] > 0); assert(region->devices[dev_idx].num_gang[1] > 0); assert(region->devices[dev_idx].num_gang[2] > 0); assert(region->devices[dev_idx].num_worker[0] > 0); assert(region->devices[dev_idx].num_worker[1] > 0); assert(region->devices[dev_idx].num_worker[2] > 0); assert(region->devices[dev_idx].vector_length > 0); for (i = 0; i < kernel->desc->num_loops; i++) assert(kernel->loops[i].stride != 0); size_t device_idx = region->devices[dev_idx].device_idx; assert(acc_runtime.opencl_data->devices_data[device_idx] != NULL); // Create a default context acc_context_t context; // Look for a matching version of the kernel, fill the context according to the selected version cl_kernel ocl_kernel = acc_build_ocl_kernel(region, kernel, &context, device_idx); cl_int status; cl_uint idx = 0; // Set params kernel arguments for (i = 0; i < kernel->desc->num_params; i++) { size_t size_param = region->desc->size_params[kernel->desc->param_ids[i]]; status = clSetKernelArg(ocl_kernel, idx, size_param, kernel->param_ptrs[i]); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (scalar #%zd).\n", status_str, region->desc->id, kernel->desc->id, idx, i ); exit(-1); /// \todo error code } idx++; } // Set scalar kernel arguments for (i = 0; i < kernel->desc->num_scalars; i++) { size_t size_scalar = region->desc->size_scalars[kernel->desc->scalar_ids[i]]; status = clSetKernelArg(ocl_kernel, idx, size_scalar, kernel->scalar_ptrs[i]); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (scalar #%zd).\n", status_str, region->desc->id, kernel->desc->id, idx, i ); exit(-1); /// \todo error code } idx++; } // Set data kernel argument for (i = 0; i < kernel->desc->num_datas; i++) { assert(kernel->data_ptrs[i] != NULL); h_void * h_data_ptr = kernel->data_ptrs[i]; size_t n = kernel->data_size[i]; acc_distributed_data(region, device_idx, &h_data_ptr, &n); d_void * d_data_ptr = acc_deviceptr_(device_idx, h_data_ptr); if (d_data_ptr == NULL) { printf("[fatal] Cannot find device pointer for %016" PRIxPTR " (%016" PRIxPTR ") on device #%zd for region[%zd].kernel[%zd] argument %u (data #%zd).\n", (uintptr_t)h_data_ptr, (uintptr_t)kernel->data_ptrs[i], device_idx, region->desc->id, kernel->desc->id, idx, i); exit(-1); /// \todo error code } status = clSetKernelArg(ocl_kernel, idx, sizeof(cl_mem), &(d_data_ptr)); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (data #%zd).\n", status_str, region->desc->id, kernel->desc->id, idx, i ); exit(-1); /// \todo error code } idx++; // if data is distributed need to provide the offset for (j = 0; j < region->desc->num_distributed_data; j++) if (kernel->desc->data_ids[i] == region->desc->distributed_data[j].id) break; if (j < region->desc->num_distributed_data) { #if DBG_KERNEL printf("[debug] region[%zd].kernel[%zd] on device #%zd data #%zd is distributed.\n", region->desc->id, kernel->desc->id, device_idx, i ); #endif assert( region->desc->distributed_data[j].mode == e_contiguous && region->desc->distributed_data[j].nbr_dev == region->desc->num_devices && region->desc->distributed_data[j].portions != NULL ); for (k = 0; k < region->desc->num_devices; k++) if (region->devices[k].device_idx == device_idx) break; assert(k < region->desc->num_devices); unsigned sum_portions = 0; unsigned prev_portion = 0; for (l = 0; l < region->desc->num_devices; l++) { sum_portions += region->desc->distributed_data[j].portions[l]; if (l < k) prev_portion += region->desc->distributed_data[j].portions[l]; }; int offset = (region->data[kernel->desc->data_ids[i]].nbr_elements_dominant_dimension * prev_portion) / sum_portions; #if DBG_KERNEL printf("[debug] sum_portions = %d\n", sum_portions); printf("[debug] prev_portion = %d\n", prev_portion); printf("[debug] offset = %d\n", offset); #endif status = clSetKernelArg(ocl_kernel, idx, sizeof(int), &offset); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u: offset for distributed data %zd.\n", status_str, region->desc->id, kernel->desc->id, idx, i ); exit(-1); /// \todo error code } idx++; } } // Set private data kernel argument for (i = 0; i < kernel->desc->num_privates; i++) { status = clSetKernelArg(ocl_kernel, idx, kernel->private_size[i], NULL); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (privatedata #%zd).\n", status_str, region->desc->id, kernel->desc->id, idx, i ); exit(-1); /// \todo error code } idx++; } // Allocate/copy context in constant memory \todo alloc only copy before launch with event wait cl_mem ocl_context = clCreateBuffer( acc_runtime.opencl_data->devices_data[device_idx]->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(struct acc_context_t_) + 2 * (context->num_loops + context->num_tiles) * sizeof(long), context, &status ); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clCreateBuffer return %s for region[%zd].kernel[%zd] when call to build the kernel copy of context.\n", status_str, region->desc->id, kernel->desc->id ); exit(-1); /// \todo error code } free(context); // Not needed anymore // Set context of the kernel status = clSetKernelArg(ocl_kernel, idx, sizeof(cl_mem), &ocl_context); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (context).\n", status_str, region->desc->id, kernel->desc->id, idx, i ); exit(-1); /// \todo error code } idx++; assert(acc_runtime.opencl_data->devices_data[device_idx]->command_queue != NULL); // Launch the kernel size_t work_dim = 0; if (region->devices[dev_idx].num_gang[2] > 1 || region->devices[dev_idx].num_worker[2] > 1) work_dim = 3; else if (region->devices[dev_idx].num_gang[1] > 1 || region->devices[dev_idx].num_worker[1] > 1) work_dim = 2; else if (region->devices[dev_idx].num_gang[0] > 1 || region->devices[dev_idx].num_worker[0] > 1) work_dim = 1; assert(work_dim > 0); size_t global_work_size[3] = { region->devices[dev_idx].num_gang[0] * region->devices[dev_idx].num_worker[0], region->devices[dev_idx].num_gang[1] * region->devices[dev_idx].num_worker[1], region->devices[dev_idx].num_gang[2] * region->devices[dev_idx].num_worker[2] }; size_t local_work_size[3] = { region->devices[dev_idx].num_worker[0], region->devices[dev_idx].num_worker[1], region->devices[dev_idx].num_worker[2] }; #if DBG_KERNEL printf("[debug] work_dim = %zd\n", work_dim); printf("[debug] global_work_size[3] = {%zd,%zd,%zd} (= %zd)\n", global_work_size[0], global_work_size[1], global_work_size[2], global_work_size[0] * global_work_size[1] * global_work_size[2]); printf("[debug] local_work_size[3] = {%zd,%zd,%zd} (= %zd)\n", local_work_size[0], local_work_size[1], local_work_size[2], local_work_size[0] * local_work_size[1] * local_work_size[2]); cl_ulong kernel_local_mem_size = 0; status = clGetKernelWorkGroupInfo(ocl_kernel, acc_runtime.opencl_data->devices[0][device_idx], CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &kernel_local_mem_size, NULL); printf("[debug] kernel_local_mem_size = %lu\n", kernel_local_mem_size); size_t kernel_work_group_size = 0; status = clGetKernelWorkGroupInfo(ocl_kernel, acc_runtime.opencl_data->devices[0][device_idx], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernel_work_group_size, NULL); printf("[debug] kernel_work_group_size = %lu\n", kernel_work_group_size); #endif cl_event event; status = clEnqueueNDRangeKernel( acc_runtime.opencl_data->devices_data[device_idx]->command_queue, ocl_kernel, /* cl_uint work_dim = */ work_dim, /* const size_t * global_work_offset = */ NULL, /* const size_t * global_work_size = */ global_work_size, /* const size_t * local_work_size = */ local_work_size, /* cl_uint num_events_in_wait_list = */ 0, /* const cl_event * event_wait_list = */ NULL, /* cl_event * event = */ &event ); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clEnqueueNDRangeKernel return %s for region[%zd].kernel[%zd].\n", status_str, region->desc->id, kernel->desc->id ); assert(0); /// \todo error code } acc_profiling_register_kernel_launch(event, device_idx, region->desc->id, kernel->desc->id); clReleaseMemObject(ocl_context); clReleaseKernel(ocl_kernel); } }
void acc_region_init(struct acc_region_t_ * region) { #if DBG_REGION printf("[debug] acc_region_init\n"); #endif size_t region_id; for (region_id = 0; region_id < compiler_data.num_regions; region_id++) if (compiler_data.regions[region_id]->id == region->desc->id) break; assert(region_id < compiler_data.num_regions); char * ocl_sources[2] = { acc_runtime.opencl_data->runtime_sources, acc_runtime.opencl_data->region_sources[region_id] }; char build_options[1024]; build_options[0] = '\0'; strcpy(build_options, "-I"); strcat(build_options, compiler_data.acc_inc_path); strcat(build_options, " "); assert(compiler_data.regions[region_id]->num_options == 0 || compiler_data.regions[region_id]->options != NULL); unsigned i; for (i = 0; i < compiler_data.regions[region_id]->num_options; i++) { assert(strlen(build_options) < 512); strcat(build_options, compiler_data.regions[region_id]->options[i]); strcat(build_options, " "); } cl_int status; unsigned idx; for (idx = 0; idx < region->desc->num_devices; idx ++) { size_t device_idx = region->devices[idx].device_idx; assert(acc_runtime.opencl_data->devices_data[device_idx] != NULL); if (acc_runtime.opencl_data->devices_data[device_idx]->programs[region_id] == NULL) { cl_context * context = &(acc_runtime.opencl_data->devices_data[device_idx]->context); cl_program * program = &(acc_runtime.opencl_data->devices_data[device_idx]->programs[region_id]); *program = clCreateProgramWithSource(*context, 2, (const char **)ocl_sources, NULL, &status); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clCreateProgramWithSource on %s (#%u) for region %zd return %s : failed\n", acc_device_name[acc_runtime.curr_device_type], acc_runtime.curr_device_num, region_id, status_str); exit(-1); } status = clBuildProgram(*program, 1, &(acc_runtime.opencl_data->devices[0][device_idx]), build_options, NULL, NULL); if (status == CL_BUILD_PROGRAM_FAILURE) acc_dbg_ocl_build_log(device_idx, *program); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clBuildProgram on %s (#%u) for region %zd return %s\n", acc_device_name[acc_runtime.curr_device_type], acc_runtime.curr_device_num, region_id, status_str); exit(-1); } cl_build_status build_status; status = clGetProgramBuildInfo(*program, acc_runtime.opencl_data->devices[0][device_idx], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &build_status, NULL); if (status != CL_SUCCESS) { const char * status_str = acc_ocl_status_to_char(status); printf("[fatal] clGetProgramBuildInfo return %s\n", status_str); exit(-1); /// \todo error code } switch (build_status) { case CL_BUILD_NONE: printf("[fatal] clGetProgramBuildInfo: build status: CL_BUILD_NONE.\n"); exit(-1); case CL_BUILD_ERROR: printf("[fatal] clGetProgramBuildInfo: build status: CL_BUILD_ERROR.\n"); acc_dbg_ocl_build_log(device_idx, *program); exit(-1); case CL_BUILD_SUCCESS: #if BUILD_LOG printf("[info] clGetProgramBuildInfo: build status: CL_BUILD_SUCCESS.\n"); acc_dbg_ocl_build_log(device_idx, *program); #endif break; case CL_BUILD_IN_PROGRESS: // Should not append as we do not provide a callback function to clBuildProgram printf("[fatal] clGetProgramBuildInfo: build status: CL_BUILD_IN_PROGRESS.\n"\ "[fatal] It should not append as we do not provide a callback function to clBuildProgram"); default: assert(0); } } } }