unsigned int pocl_hsa_probe(struct pocl_device_ops *ops) { int env_count = pocl_device_get_env_count(ops->device_name); POCL_MSG_PRINT_INFO("pocl-hsa: found %d env devices with %s.\n", env_count, ops->device_name); /* No hsa env specified, the user did not request for HSA agents. */ if (env_count <= 0) return 0; if (hsa_init() != HSA_STATUS_SUCCESS) { POCL_ABORT("pocl-hsa: hsa_init() failed."); } if (hsa_iterate_agents(pocl_hsa_get_agents, NULL) != HSA_STATUS_SUCCESS) { assert (0 && "pocl-hsa: could not get agents."); } POCL_MSG_PRINT_INFO("pocl-hsa: found %d agents.\n", found_hsa_agents); return found_hsa_agents; }
/*
 * Set up the backing storage of mem_obj for the given device.
 *
 * If another entry of mem_obj->device_ptrs is marked available, refers to the
 * same global_mem_id as this device and already has a non-NULL mem_ptr, that
 * pointer is reused. Otherwise the storage is either the caller's host_ptr
 * (CL_MEM_USE_HOST_PTR) or a fresh block obtained from
 * pocl_memalign_alloc_global_mem(), in which case this device is recorded as
 * the shared_mem_allocation_owner.
 *
 * Returns CL_SUCCESS, or CL_MEM_OBJECT_ALLOCATION_FAILURE when the allocation
 * returns NULL.
 */
cl_int
pocl_basic_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr)
{
  const cl_mem_flags mem_flags = mem_obj->flags;
  void *storage = NULL;
  unsigned dev_idx;

  POCL_MSG_PRINT_INFO ("BASIC: alloc_mem_obj, mem %p, dev %d\n", mem_obj,
                       device->dev_id);

  /* Look for storage that some driver already set up in the same global
     address space; if there is one, share it instead of allocating. */
  for (dev_idx = 0; dev_idx < mem_obj->context->num_devices; ++dev_idx)
    {
      if (mem_obj->device_ptrs[dev_idx].available
          && mem_obj->device_ptrs[dev_idx].global_mem_id
                 == device->global_mem_id
          && mem_obj->device_ptrs[dev_idx].mem_ptr != NULL)
        {
          mem_obj->device_ptrs[device->dev_id].mem_ptr
              = mem_obj->device_ptrs[dev_idx].mem_ptr;
          POCL_MSG_PRINT_INFO ("BASIC: alloc_mem_obj %p dev %d, using already allocated mem\n",
                               mem_obj, device->dev_id);
          return CL_SUCCESS;
        }
    }

  /* Nothing to share yet: pick or create the storage now. */
  if (mem_flags & CL_MEM_USE_HOST_PTR)
    {
      /* The caller-provided pointer becomes the storage, so it must exist. */
      assert (host_ptr != NULL);
      storage = host_ptr;
    }
  else
    {
      storage = pocl_memalign_alloc_global_mem (device, MAX_EXTENDED_ALIGNMENT,
                                                mem_obj->size);
      if (storage == NULL)
        return CL_MEM_OBJECT_ALLOCATION_FAILURE;

      mem_obj->shared_mem_allocation_owner = device;
    }

  /* With CL_MEM_ALLOC_HOST_PTR the device allocation doubles as the host
     pointer, unless one has been set already. */
  if ((mem_flags & CL_MEM_ALLOC_HOST_PTR) && mem_obj->mem_host_ptr == NULL)
    mem_obj->mem_host_ptr = storage;

  if (mem_flags & CL_MEM_COPY_HOST_PTR)
    {
      /* The initial contents come from host_ptr, so it must exist. */
      assert (host_ptr != NULL);
      memcpy (storage, host_ptr, mem_obj->size);
    }

  mem_obj->device_ptrs[device->dev_id].mem_ptr = storage;

  return CL_SUCCESS;
}
/*
 * Serialize the complete cache directory of one kernel for one device.
 *
 * The kernel's cache directory is handed to recursively_serialize_path()
 * together with the string length of the owning program's cache path
 * (presumably so that the stored paths can be made relative to the program
 * directory -- confirm in recursively_serialize_path).
 *
 * Returns whatever recursively_serialize_path() returns for `buffer`.
 */
static unsigned char *
serialize_kernel_cachedir (cl_kernel kernel, unsigned device_i,
                           unsigned char *buffer)
{
  char program_dir[POCL_FILENAME_LENGTH];
  char kernel_dir[POCL_FILENAME_LENGTH];
  cl_program owner = kernel->program;
  size_t program_dir_len;

  pocl_cache_program_path (program_dir, owner, device_i);
  program_dir_len = strlen (program_dir);

  pocl_cache_kernel_cachedir (kernel_dir, owner, device_i, kernel);

  POCL_MSG_PRINT_INFO ("Kernel %s: recur serializing cachedir %s\n",
                       kernel->name, kernel_dir);

  return recursively_serialize_path (kernel_dir, program_dir_len, buffer);
}
cl_int compile_and_link_program(int compile_program, int link_program, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, cl_uint num_input_headers, const cl_program *input_headers, const char **header_include_names, cl_uint num_input_programs, const cl_program *input_programs, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data) { char program_bc_path[POCL_FILENAME_LENGTH]; char link_options[512]; int errcode, error; int create_library = 0; int requires_cr_sqrt_div = 0; int spir_build = 0; unsigned flush_denorms = 0; uint64_t fsize; cl_device_id *unique_devlist = NULL; char *binary = NULL; unsigned device_i = 0, actually_built = 0; size_t i, j; char *temp_options = NULL; const char *extra_build_options = pocl_get_string_option ("POCL_EXTRA_BUILD_FLAGS", NULL); int build_error_code = (link_program ? CL_BUILD_PROGRAM_FAILURE : CL_COMPILE_PROGRAM_FAILURE); POCL_GOTO_LABEL_COND (PFN_NOTIFY, (program == NULL), CL_INVALID_PROGRAM); POCL_GOTO_LABEL_COND (PFN_NOTIFY, (num_devices > 0 && device_list == NULL), CL_INVALID_VALUE); POCL_GOTO_LABEL_COND (PFN_NOTIFY, (num_devices == 0 && device_list != NULL), CL_INVALID_VALUE); POCL_GOTO_LABEL_COND (PFN_NOTIFY, (pfn_notify == NULL && user_data != NULL), CL_INVALID_VALUE); POCL_GOTO_LABEL_ON (PFN_NOTIFY, program->kernels, CL_INVALID_OPERATION, "Program already has kernels\n"); POCL_GOTO_LABEL_ON (PFN_NOTIFY, (program->source == NULL && program->binaries == NULL), CL_INVALID_PROGRAM, "Program doesn't have sources or binaries! 
You need " "to call clCreateProgramWith{Binary|Source} first\n"); POCL_GOTO_LABEL_ON (PFN_NOTIFY, ((program->source == NULL) && (link_program == 0)), CL_INVALID_OPERATION, "Cannot clCompileProgram when program has no source\n"); POCL_LOCK_OBJ (program); program->main_build_log[0] = 0; /* TODO this should be somehow utilized at linking */ POCL_MEM_FREE (program->compiler_options); if (extra_build_options) { size_t len = (options != NULL) ? strlen (options) : 0; len += strlen (extra_build_options) + 2; temp_options = (char *)malloc (len); temp_options[0] = 0; if (options != NULL) { strcpy (temp_options, options); strcat (temp_options, " "); } strcat (temp_options, extra_build_options); } else temp_options = (char*) options; if (temp_options) { i = strlen (temp_options); size_t size = i + 512; /* add some space for pocl-added options */ program->compiler_options = (char *)malloc (size); errcode = process_options (temp_options, program->compiler_options, link_options, program, compile_program, link_program, &create_library, &flush_denorms, &requires_cr_sqrt_div, &spir_build, size); if (errcode != CL_SUCCESS) goto ERROR_CLEAN_OPTIONS; } POCL_MSG_PRINT_LLVM ("building program with options %s\n", program->compiler_options); program->flush_denorms = flush_denorms; #if !(defined(__x86_64__) && defined(__GNUC__)) if (flush_denorms) { POCL_MSG_WARN ("flush to zero is currently only implemented for " "x86-64 & gcc/clang, ignoring flag\n"); } #endif /* DEVICE LIST */ if (num_devices == 0) { num_devices = program->num_devices; device_list = program->devices; } else { // convert subdevices to devices and remove duplicates cl_uint real_num_devices = 0; unique_devlist = pocl_unique_device_list (device_list, num_devices, &real_num_devices); num_devices = real_num_devices; device_list = unique_devlist; } clean_program_on_rebuild (program); /* Build the fully linked non-parallel bitcode for all devices. 
*/ for (device_i = 0; device_i < program->num_devices; ++device_i) { cl_device_id device = program->devices[device_i]; /* find the device in the supplied devices-to-build-for list */ int found = 0; for (i = 0; i < num_devices; ++i) if (device_list[i] == device) found = 1; if (!found) continue; if (requires_cr_sqrt_div && !(device->single_fp_config & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT)) { APPEND_TO_MAIN_BUILD_LOG (REQUIRES_CR_SQRT_DIV_ERR); POCL_GOTO_ERROR_ON (1, build_error_code, REQUIRES_CR_SQRT_DIV_ERR " %s\n", device->short_name); } actually_built++; /* clCreateProgramWithSource */ if (program->source) { #ifdef OCS_AVAILABLE if (device->compiler_available == CL_TRUE) { POCL_MSG_PRINT_INFO ("building from sources for device %d\n", device_i); error = pocl_llvm_build_program ( program, device_i, program->compiler_options, program_bc_path, num_input_headers, input_headers, header_include_names, (create_library ? 0 : link_program)); POCL_GOTO_ERROR_ON ((error != 0), build_error_code, "pocl_llvm_build_program() failed\n"); } else #endif { APPEND_TO_MAIN_BUILD_LOG ( "Cannot build a program from sources with pocl " "that does not have online compiler support\n"); POCL_GOTO_ERROR_ON (1, CL_COMPILER_NOT_AVAILABLE, "%s", program->main_build_log); } } /* clCreateProgramWithBinaries */ else if (program->binaries[device_i] && (program->pocl_binaries[device_i] == NULL)) { #ifdef OCS_AVAILABLE /* bitcode is now either plain LLVM IR or SPIR IR */ int spir_binary = bitcode_is_spir ((char*)program->binaries[device_i], program->binary_sizes[device_i]); if (spir_binary) POCL_MSG_PRINT_LLVM ("LLVM-SPIR binary detected\n"); else POCL_MSG_PRINT_LLVM ("building from a BC binary for device %d\n", device_i); if (spir_binary) { #ifdef ENABLE_SPIR if (!strstr (device->extensions, "cl_khr_spir")) { APPEND_TO_MAIN_BUILD_LOG (REQUIRES_SPIR_SUPPORT); POCL_GOTO_ERROR_ON (1, build_error_code, REQUIRES_SPIR_SUPPORT " %s\n", device->short_name); } if (!spir_build) POCL_MSG_WARN ( "SPIR binary 
provided, but no spir in build options\n"); /* SPIR binaries need to be explicitly linked to the kernel * library. for non-SPIR binaries this happens as part of build * process when program.bc is generated. */ error = pocl_llvm_link_program (program, device_i, program_bc_path, 0, NULL, NULL, NULL, 0, 1); POCL_GOTO_ERROR_ON (error, CL_LINK_PROGRAM_FAILURE, "Failed to link SPIR program.bc\n"); #else APPEND_TO_MAIN_BUILD_LOG (REQUIRES_SPIR_SUPPORT); POCL_GOTO_ERROR_ON (1, build_error_code, REQUIRES_SPIR_SUPPORT " %s\n", device->short_name); #endif } #else APPEND_TO_MAIN_BUILD_LOG ( "Cannot build program from LLVM IR binaries with " "pocl that does not have online compiler support\n"); POCL_GOTO_ERROR_ON (1, CL_COMPILER_NOT_AVAILABLE, "%s", program->main_build_log); #endif } else if (program->pocl_binaries[device_i]) { POCL_MSG_PRINT_INFO("having a poclbinary for device %d\n", device_i); #ifdef OCS_AVAILABLE if (program->binaries[device_i] == NULL) { POCL_MSG_WARN ( "pocl-binary for this device doesn't contain " "program.bc - you won't be able to rebuild/link it\n"); /* do not try to read program.bc or LLVM IRs * TODO maybe read LLVM IRs ?*/ continue; } #else continue; #endif } else if (link_program && (num_input_programs > 0)) { #ifdef OCS_AVAILABLE /* just link binaries. 
*/ unsigned char *cur_device_binaries[num_input_programs]; size_t cur_device_binary_sizes[num_input_programs]; void *cur_llvm_irs[num_input_programs]; for (j = 0; j < num_input_programs; j++) { assert (device == input_programs[j]->devices[device_i]); cur_device_binaries[j] = input_programs[j]->binaries[device_i]; assert (cur_device_binaries[j]); cur_device_binary_sizes[j] = input_programs[j]->binary_sizes[device_i]; if (input_programs[j]->llvm_irs[device_i] == NULL) pocl_update_program_llvm_irs (input_programs[j], device_i); cur_llvm_irs[j] = input_programs[j]->llvm_irs[device_i]; assert (cur_llvm_irs[j]); } error = pocl_llvm_link_program ( program, device_i, program_bc_path, num_input_programs, cur_device_binaries, cur_device_binary_sizes, cur_llvm_irs, create_library, 0); POCL_GOTO_ERROR_ON ((error != CL_SUCCESS), CL_LINK_PROGRAM_FAILURE, "pocl_llvm_link_program() failed\n"); #else POCL_GOTO_ERROR_ON ((1), CL_LINK_PROGRAM_FAILURE, "clCompileProgram/clLinkProgram/clBuildProgram" " require a pocl built with LLVM\n"); #endif } else { POCL_GOTO_ERROR_ON (1, CL_INVALID_BINARY, "No sources nor binaries for device %s - can't " "build the program\n", device->short_name); } #ifdef OCS_AVAILABLE /* Read binaries from program.bc to memory */ if (program->binaries[device_i] == NULL) { errcode = pocl_read_file(program_bc_path, &binary, &fsize); POCL_GOTO_ERROR_ON(errcode, CL_BUILD_ERROR, "Failed to read binaries from program.bc to " "memory: %s\n", program_bc_path); program->binary_sizes[device_i] = (size_t)fsize; program->binaries[device_i] = (unsigned char *)binary; } if (program->llvm_irs[device_i] == NULL) { pocl_update_program_llvm_irs(program, device_i); } /* Maintain a 'last_accessed' file in every program's * cache directory. 
Will be useful for cache pruning script * that flushes old directories based on LRU */ pocl_cache_update_program_last_access(program, device_i); #endif } POCL_GOTO_ERROR_ON ((actually_built < num_devices), build_error_code, "Some of the devices on the argument-supplied list are" "not available for the program, or do not exist\n"); program->build_status = CL_BUILD_SUCCESS; program->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE; /* if program will be compiled using clCompileProgram its binary_type * will be set to CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT. * * if program was created by clLinkProgram which is called * with the –createlibrary link option its binary_type will be set to * CL_PROGRAM_BINARY_TYPE_LIBRARY. */ if (create_library) program->binary_type = CL_PROGRAM_BINARY_TYPE_LIBRARY; if (compile_program && !link_program) program->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT; assert(program->num_kernels == 0); /* get non-device-specific kernel metadata. We can stop after finding * the first method that works.*/ for (device_i = 0; device_i < program->num_devices; device_i++) { #ifdef OCS_AVAILABLE if (program->binaries[device_i]) { program->num_kernels = pocl_llvm_get_kernel_count (program, device_i); if (program->num_kernels) { program->kernel_meta = calloc (program->num_kernels, sizeof (pocl_kernel_metadata_t)); pocl_llvm_get_kernels_metadata (program, device_i); } break; } #endif if (program->pocl_binaries[device_i]) { program->num_kernels = pocl_binary_get_kernel_count (program, device_i); if (program->num_kernels) { program->kernel_meta = calloc (program->num_kernels, sizeof (pocl_kernel_metadata_t)); pocl_binary_get_kernels_metadata (program, device_i); } break; } } POCL_GOTO_ERROR_ON ((device_i >= program->num_devices), CL_INVALID_BINARY, "Could find kernel metadata in the built program\n"); /* calculate device-specific kernel hashes. 
*/ for (j = 0; j < program->num_kernels; ++j) { program->kernel_meta[j].build_hash = calloc (program->num_devices, sizeof (pocl_kernel_hash_t)); for (device_i = 0; device_i < program->num_devices; device_i++) { pocl_calculate_kernel_hash (program, j, device_i); } } errcode = CL_SUCCESS; goto FINISH; ERROR: free_meta (program); program->kernels = NULL; for (device_i = 0; device_i < program->num_devices; device_i++) { if (program->source) { POCL_MEM_FREE (program->binaries[device_i]); program->binary_sizes[device_i] = 0; } } ERROR_CLEAN_OPTIONS: if (temp_options != options) free (temp_options); program->build_status = CL_BUILD_ERROR; FINISH: POCL_UNLOCK_OBJ (program); POCL_MEM_FREE (unique_devlist); PFN_NOTIFY: if (pfn_notify) pfn_notify (program, user_data); return errcode; }
void pocl_cuda_init (cl_device_id device, const char *parameters) { CUresult result; result = cuInit (0); CUDA_CHECK (result, "cuInit"); if (device->data) return; pocl_cuda_device_data_t *data = malloc (sizeof (pocl_cuda_device_data_t)); result = cuDeviceGet (&data->device, 0); CUDA_CHECK (result, "cuDeviceGet"); // Get specific device name device->long_name = device->short_name = malloc (256 * sizeof (char)); cuDeviceGetName (device->long_name, 256, data->device); // Get other device properties cuDeviceGetAttribute ((int *)&device->max_work_group_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, data->device); cuDeviceGetAttribute ((int *)(device->max_work_item_sizes + 0), CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, data->device); cuDeviceGetAttribute ((int *)(device->max_work_item_sizes + 1), CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, data->device); cuDeviceGetAttribute ((int *)(device->max_work_item_sizes + 2), CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, data->device); cuDeviceGetAttribute ( (int *)&device->local_mem_size, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, data->device); cuDeviceGetAttribute ((int *)&device->max_compute_units, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, data->device); cuDeviceGetAttribute ((int *)&device->max_clock_frequency, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, data->device); cuDeviceGetAttribute ((int *)&device->error_correction_support, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, data->device); cuDeviceGetAttribute ((int *)&device->host_unified_memory, CU_DEVICE_ATTRIBUTE_INTEGRATED, data->device); cuDeviceGetAttribute ((int *)&device->max_constant_buffer_size, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, data->device); device->preferred_vector_width_char = 1; device->preferred_vector_width_short = 1; device->preferred_vector_width_int = 1; device->preferred_vector_width_long = 1; device->preferred_vector_width_float = 1; device->preferred_vector_width_double = 1; device->preferred_vector_width_half = 0; device->native_vector_width_char = 1; 
device->native_vector_width_short = 1; device->native_vector_width_int = 1; device->native_vector_width_long = 1; device->native_vector_width_float = 1; device->native_vector_width_double = 1; device->native_vector_width_half = 0; device->single_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN | CL_FP_DENORM; device->double_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN | CL_FP_DENORM; device->local_mem_type = CL_LOCAL; device->host_unified_memory = 0; // Get GPU architecture name int sm_maj, sm_min; cuDeviceGetAttribute (&sm_maj, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, data->device); cuDeviceGetAttribute (&sm_min, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, data->device); char *gpu_arch = malloc (16 * sizeof (char)); snprintf (gpu_arch, 16, "sm_%d%d", sm_maj, sm_min); device->llvm_cpu = pocl_get_string_option ("POCL_CUDA_GPU_ARCH", gpu_arch); POCL_MSG_PRINT_INFO ("[CUDA] GPU architecture = %s\n", device->llvm_cpu); // Create context result = cuCtxCreate (&data->context, CU_CTX_MAP_HOST, data->device); CUDA_CHECK (result, "cuCtxCreate"); // Get global memory size size_t memfree, memtotal; result = cuMemGetInfo (&memfree, &memtotal); device->max_mem_alloc_size = max (memtotal / 4, 128 * 1024 * 1024); device->global_mem_size = memtotal; device->data = data; }