void pocl_cuda_uninit (cl_device_id device) { pocl_cuda_device_data_t *data = device->data; cuCtxDestroy (data->context); POCL_MEM_FREE (data); device->data = NULL; POCL_MEM_FREE (device->long_name); }
void pocl_hsa_uninit (cl_device_id device) { struct data *d = (struct data*)device->data; POCL_MEM_FREE(d); device->data = NULL; }
void pocl_remove_file (const char *file_path) { int str_size = 10 + strlen(file_path) + 1; char *cmd = (char*)malloc(str_size); snprintf(cmd, str_size, "rm -f '%s'", file_path); system(cmd); POCL_MEM_FREE(cmd); }
void pocl_remove_directory (const char *path_name) { int str_size = 10 + strlen(path_name) + 1; char *cmd = (char*)malloc(str_size); snprintf(cmd, str_size, "rm -fr '%s'", path_name); system(cmd); POCL_MEM_FREE(cmd); }
void pocl_make_directory (const char *path_name) { int str_size = 12 + strlen(path_name) + 1; char *cmd = (char*)malloc(str_size); snprintf(cmd, str_size, "mkdir -p '%s'", path_name); system(cmd); POCL_MEM_FREE(cmd); }
static void clean_program_on_rebuild (cl_program program) { /* if we're rebuilding the program, release the kernels and reset log/status */ size_t i; if ((program->build_status != CL_BUILD_NONE) || program->num_kernels > 0) { /* Spec says: CL_INVALID_OPERATION if there are kernel objects attached to program. ...and we check for that earlier. */ assert (program->kernels == NULL); free_meta (program); program->num_kernels = 0; program->build_status = CL_BUILD_NONE; for (i = 0; i < program->num_devices; ++i) { POCL_MEM_FREE (program->build_log[i]); memset (program->build_hash[i], 0, sizeof (SHA1_digest_t)); if (program->source) { POCL_MEM_FREE (program->binaries[i]); program->binary_sizes[i] = 0; #ifdef OCS_AVAILABLE if (program->llvm_irs[i]) pocl_free_llvm_irs (program, i); #endif POCL_MEM_FREE (program->pocl_binaries[i]); program->pocl_binary_sizes[i] = 0; } } program->main_build_log[0] = 0; } }
int pocl_cache_create_program_cachedir(cl_program program, unsigned device_i, const char* preprocessed_source, size_t source_len, char* program_bc_path, void** cache_lock) { const char *hash_source = NULL; uint8_t old_build_hash[SHA1_DIGEST_SIZE] = {0}; size_t hs_len = 0; assert(cache_topdir_initialized); if (program->source && preprocessed_source==NULL) { hash_source = program->source; hs_len = strlen(program->source); } else { hash_source = preprocessed_source; hs_len = source_len; } if (program->build_hash[device_i]) memcpy(old_build_hash, program->build_hash[device_i], SHA1_DIGEST_SIZE); build_program_compute_hash(program, device_i, hash_source, hs_len); /* if the old hash is nonzero and different, we must free the built binaries before returning, so that they get loaded from the new location */ if (old_build_hash[0] && memcmp(old_build_hash, program->build_hash[device_i], SHA1_DIGEST_SIZE)) { if (program->binaries[device_i]) { POCL_MEM_FREE(program->binaries[device_i]); program->binary_sizes[device_i] = 0; } pocl_free_llvm_irs(program, device_i); } program_device_dir(program_bc_path, program, device_i, ""); if (pocl_mkdir_p(program_bc_path)) return 1; pocl_cache_program_bc_path(program_bc_path, program, device_i); *cache_lock = pocl_cache_acquire_writer_lock_i(program, device_i); return 0; }
/** * Get the number of specified devices from environnement */ int pocl_device_get_env_count(const char *dev_type) { const char *dev_env = getenv(POCL_DEVICES_ENV); char *ptr, *saveptr, *tofree, *token; unsigned int dev_count = 0; if (dev_env == NULL) { return -1; } ptr = tofree = strdup(dev_env); while ((token = strtok_r (ptr, " ", &saveptr)) != NULL) { if(strcmp(token, dev_type) == 0) dev_count++; ptr = NULL; } POCL_MEM_FREE(tofree); return dev_count; }
static void free_meta (cl_program program) { size_t i; unsigned j; if (program->num_kernels) { for (i = 0; i < program->num_kernels; i++) { pocl_kernel_metadata_t *meta = &program->kernel_meta[i]; POCL_MEM_FREE (meta->attributes); POCL_MEM_FREE (meta->name); POCL_MEM_FREE (meta->arg_info); for (j = 0; j < program->num_devices; ++j) if (meta->data[j] != NULL) meta->data[j] = NULL; // TODO free data in driver callback POCL_MEM_FREE (meta->data); POCL_MEM_FREE (meta->local_sizes); POCL_MEM_FREE (meta->build_hash); } POCL_MEM_FREE (program->kernel_meta); } }
cl_int compile_and_link_program(int compile_program, int link_program, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, cl_uint num_input_headers, const cl_program *input_headers, const char **header_include_names, cl_uint num_input_programs, const cl_program *input_programs, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data) { char program_bc_path[POCL_FILENAME_LENGTH]; char link_options[512]; int errcode, error; int create_library = 0; int requires_cr_sqrt_div = 0; int spir_build = 0; unsigned flush_denorms = 0; uint64_t fsize; cl_device_id *unique_devlist = NULL; char *binary = NULL; unsigned device_i = 0, actually_built = 0; size_t i, j; char *temp_options = NULL; const char *extra_build_options = pocl_get_string_option ("POCL_EXTRA_BUILD_FLAGS", NULL); int build_error_code = (link_program ? CL_BUILD_PROGRAM_FAILURE : CL_COMPILE_PROGRAM_FAILURE); POCL_GOTO_LABEL_COND (PFN_NOTIFY, (program == NULL), CL_INVALID_PROGRAM); POCL_GOTO_LABEL_COND (PFN_NOTIFY, (num_devices > 0 && device_list == NULL), CL_INVALID_VALUE); POCL_GOTO_LABEL_COND (PFN_NOTIFY, (num_devices == 0 && device_list != NULL), CL_INVALID_VALUE); POCL_GOTO_LABEL_COND (PFN_NOTIFY, (pfn_notify == NULL && user_data != NULL), CL_INVALID_VALUE); POCL_GOTO_LABEL_ON (PFN_NOTIFY, program->kernels, CL_INVALID_OPERATION, "Program already has kernels\n"); POCL_GOTO_LABEL_ON (PFN_NOTIFY, (program->source == NULL && program->binaries == NULL), CL_INVALID_PROGRAM, "Program doesn't have sources or binaries! You need " "to call clCreateProgramWith{Binary|Source} first\n"); POCL_GOTO_LABEL_ON (PFN_NOTIFY, ((program->source == NULL) && (link_program == 0)), CL_INVALID_OPERATION, "Cannot clCompileProgram when program has no source\n"); POCL_LOCK_OBJ (program); program->main_build_log[0] = 0; /* TODO this should be somehow utilized at linking */ POCL_MEM_FREE (program->compiler_options); if (extra_build_options) { size_t len = (options != NULL) ? strlen (options) : 0; len += strlen (extra_build_options) + 2; temp_options = (char *)malloc (len); temp_options[0] = 0; if (options != NULL) { strcpy (temp_options, options); strcat (temp_options, " "); } strcat (temp_options, extra_build_options); } else temp_options = (char*) options; if (temp_options) { i = strlen (temp_options); size_t size = i + 512; /* add some space for pocl-added options */ program->compiler_options = (char *)malloc (size); errcode = process_options (temp_options, program->compiler_options, link_options, program, compile_program, link_program, &create_library, &flush_denorms, &requires_cr_sqrt_div, &spir_build, size); if (errcode != CL_SUCCESS) goto ERROR_CLEAN_OPTIONS; } POCL_MSG_PRINT_LLVM ("building program with options %s\n", program->compiler_options); program->flush_denorms = flush_denorms; #if !(defined(__x86_64__) && defined(__GNUC__)) if (flush_denorms) { POCL_MSG_WARN ("flush to zero is currently only implemented for " "x86-64 & gcc/clang, ignoring flag\n"); } #endif /* DEVICE LIST */ if (num_devices == 0) { num_devices = program->num_devices; device_list = program->devices; } else { // convert subdevices to devices and remove duplicates cl_uint real_num_devices = 0; unique_devlist = pocl_unique_device_list (device_list, num_devices, &real_num_devices); num_devices = real_num_devices; device_list = unique_devlist; } clean_program_on_rebuild (program); /* Build the fully linked non-parallel bitcode for all devices. */ for (device_i = 0; device_i < program->num_devices; ++device_i) { cl_device_id device = program->devices[device_i]; /* find the device in the supplied devices-to-build-for list */ int found = 0; for (i = 0; i < num_devices; ++i) if (device_list[i] == device) found = 1; if (!found) continue; if (requires_cr_sqrt_div && !(device->single_fp_config & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT)) { APPEND_TO_MAIN_BUILD_LOG (REQUIRES_CR_SQRT_DIV_ERR); POCL_GOTO_ERROR_ON (1, build_error_code, REQUIRES_CR_SQRT_DIV_ERR " %s\n", device->short_name); } actually_built++; /* clCreateProgramWithSource */ if (program->source) { #ifdef OCS_AVAILABLE if (device->compiler_available == CL_TRUE) { POCL_MSG_PRINT_INFO ("building from sources for device %d\n", device_i); error = pocl_llvm_build_program ( program, device_i, program->compiler_options, program_bc_path, num_input_headers, input_headers, header_include_names, (create_library ? 0 : link_program)); POCL_GOTO_ERROR_ON ((error != 0), build_error_code, "pocl_llvm_build_program() failed\n"); } else #endif { APPEND_TO_MAIN_BUILD_LOG ( "Cannot build a program from sources with pocl " "that does not have online compiler support\n"); POCL_GOTO_ERROR_ON (1, CL_COMPILER_NOT_AVAILABLE, "%s", program->main_build_log); } } /* clCreateProgramWithBinaries */ else if (program->binaries[device_i] && (program->pocl_binaries[device_i] == NULL)) { #ifdef OCS_AVAILABLE /* bitcode is now either plain LLVM IR or SPIR IR */ int spir_binary = bitcode_is_spir ((char*)program->binaries[device_i], program->binary_sizes[device_i]); if (spir_binary) POCL_MSG_PRINT_LLVM ("LLVM-SPIR binary detected\n"); else POCL_MSG_PRINT_LLVM ("building from a BC binary for device %d\n", device_i); if (spir_binary) { #ifdef ENABLE_SPIR if (!strstr (device->extensions, "cl_khr_spir")) { APPEND_TO_MAIN_BUILD_LOG (REQUIRES_SPIR_SUPPORT); POCL_GOTO_ERROR_ON (1, build_error_code, REQUIRES_SPIR_SUPPORT " %s\n", device->short_name); } if (!spir_build) POCL_MSG_WARN ( "SPIR binary provided, but no spir in build options\n"); /* SPIR binaries need to be explicitly linked to the kernel * library. for non-SPIR binaries this happens as part of build * process when program.bc is generated. */ error = pocl_llvm_link_program (program, device_i, program_bc_path, 0, NULL, NULL, NULL, 0, 1); POCL_GOTO_ERROR_ON (error, CL_LINK_PROGRAM_FAILURE, "Failed to link SPIR program.bc\n"); #else APPEND_TO_MAIN_BUILD_LOG (REQUIRES_SPIR_SUPPORT); POCL_GOTO_ERROR_ON (1, build_error_code, REQUIRES_SPIR_SUPPORT " %s\n", device->short_name); #endif } #else APPEND_TO_MAIN_BUILD_LOG ( "Cannot build program from LLVM IR binaries with " "pocl that does not have online compiler support\n"); POCL_GOTO_ERROR_ON (1, CL_COMPILER_NOT_AVAILABLE, "%s", program->main_build_log); #endif } else if (program->pocl_binaries[device_i]) { POCL_MSG_PRINT_INFO("having a poclbinary for device %d\n", device_i); #ifdef OCS_AVAILABLE if (program->binaries[device_i] == NULL) { POCL_MSG_WARN ( "pocl-binary for this device doesn't contain " "program.bc - you won't be able to rebuild/link it\n"); /* do not try to read program.bc or LLVM IRs * TODO maybe read LLVM IRs ?*/ continue; } #else continue; #endif } else if (link_program && (num_input_programs > 0)) { #ifdef OCS_AVAILABLE /* just link binaries. */ unsigned char *cur_device_binaries[num_input_programs]; size_t cur_device_binary_sizes[num_input_programs]; void *cur_llvm_irs[num_input_programs]; for (j = 0; j < num_input_programs; j++) { assert (device == input_programs[j]->devices[device_i]); cur_device_binaries[j] = input_programs[j]->binaries[device_i]; assert (cur_device_binaries[j]); cur_device_binary_sizes[j] = input_programs[j]->binary_sizes[device_i]; if (input_programs[j]->llvm_irs[device_i] == NULL) pocl_update_program_llvm_irs (input_programs[j], device_i); cur_llvm_irs[j] = input_programs[j]->llvm_irs[device_i]; assert (cur_llvm_irs[j]); } error = pocl_llvm_link_program ( program, device_i, program_bc_path, num_input_programs, cur_device_binaries, cur_device_binary_sizes, cur_llvm_irs, create_library, 0); POCL_GOTO_ERROR_ON ((error != CL_SUCCESS), CL_LINK_PROGRAM_FAILURE, "pocl_llvm_link_program() failed\n"); #else POCL_GOTO_ERROR_ON ((1), CL_LINK_PROGRAM_FAILURE, "clCompileProgram/clLinkProgram/clBuildProgram" " require a pocl built with LLVM\n"); #endif } else { POCL_GOTO_ERROR_ON (1, CL_INVALID_BINARY, "No sources nor binaries for device %s - can't " "build the program\n", device->short_name); } #ifdef OCS_AVAILABLE /* Read binaries from program.bc to memory */ if (program->binaries[device_i] == NULL) { errcode = pocl_read_file(program_bc_path, &binary, &fsize); POCL_GOTO_ERROR_ON(errcode, CL_BUILD_ERROR, "Failed to read binaries from program.bc to " "memory: %s\n", program_bc_path); program->binary_sizes[device_i] = (size_t)fsize; program->binaries[device_i] = (unsigned char *)binary; } if (program->llvm_irs[device_i] == NULL) { pocl_update_program_llvm_irs(program, device_i); } /* Maintain a 'last_accessed' file in every program's * cache directory. Will be useful for cache pruning script * that flushes old directories based on LRU */ pocl_cache_update_program_last_access(program, device_i); #endif } POCL_GOTO_ERROR_ON ((actually_built < num_devices), build_error_code, "Some of the devices on the argument-supplied list are" "not available for the program, or do not exist\n"); program->build_status = CL_BUILD_SUCCESS; program->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE; /* if program will be compiled using clCompileProgram its binary_type * will be set to CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT. * * if program was created by clLinkProgram which is called * with the –createlibrary link option its binary_type will be set to * CL_PROGRAM_BINARY_TYPE_LIBRARY. */ if (create_library) program->binary_type = CL_PROGRAM_BINARY_TYPE_LIBRARY; if (compile_program && !link_program) program->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT; assert(program->num_kernels == 0); /* get non-device-specific kernel metadata. We can stop after finding * the first method that works.*/ for (device_i = 0; device_i < program->num_devices; device_i++) { #ifdef OCS_AVAILABLE if (program->binaries[device_i]) { program->num_kernels = pocl_llvm_get_kernel_count (program, device_i); if (program->num_kernels) { program->kernel_meta = calloc (program->num_kernels, sizeof (pocl_kernel_metadata_t)); pocl_llvm_get_kernels_metadata (program, device_i); } break; } #endif if (program->pocl_binaries[device_i]) { program->num_kernels = pocl_binary_get_kernel_count (program, device_i); if (program->num_kernels) { program->kernel_meta = calloc (program->num_kernels, sizeof (pocl_kernel_metadata_t)); pocl_binary_get_kernels_metadata (program, device_i); } break; } } POCL_GOTO_ERROR_ON ((device_i >= program->num_devices), CL_INVALID_BINARY, "Could find kernel metadata in the built program\n"); /* calculate device-specific kernel hashes. */ for (j = 0; j < program->num_kernels; ++j) { program->kernel_meta[j].build_hash = calloc (program->num_devices, sizeof (pocl_kernel_hash_t)); for (device_i = 0; device_i < program->num_devices; device_i++) { pocl_calculate_kernel_hash (program, j, device_i); } } errcode = CL_SUCCESS; goto FINISH; ERROR: free_meta (program); program->kernels = NULL; for (device_i = 0; device_i < program->num_devices; device_i++) { if (program->source) { POCL_MEM_FREE (program->binaries[device_i]); program->binary_sizes[device_i] = 0; } } ERROR_CLEAN_OPTIONS: if (temp_options != options) free (temp_options); program->build_status = CL_BUILD_ERROR; FINISH: POCL_UNLOCK_OBJ (program); POCL_MEM_FREE (unique_devlist); PFN_NOTIFY: if (pfn_notify) pfn_notify (program, user_data); return errcode; }
int pocl_topology_detect_device_info (cl_device_id device) { device->global_mem_cacheline_size = HOST_CPU_CACHELINE_SIZE; device->global_mem_cache_type = 0x2; // CL_READ_WRITE_CACHE, without including all of CL/cl.h /* global mem cache size */ char *content; uint64_t filesize; if (pocl_read_file (L3_CACHE_SIZE, &content, &filesize) == 0) { long val = atol (content); device->global_mem_cache_size = val * 1024; POCL_MEM_FREE (content); } else { if (pocl_read_file (L2_CACHE_SIZE, &content, &filesize) == 0) { long val = atol (content); device->global_mem_cache_size = val * 1024; POCL_MEM_FREE (content); } else { POCL_MSG_WARN ( "Could not figure out CPU cache size, using bogus value\n"); device->global_mem_cache_size = 1 << 20; } } /* global_mem_size */ if (pocl_read_file (MEMINFO, &content, &filesize) == 0) { printf ("content 11: %s FSIZE: %lu \n", content, filesize); char *tmp = content; unsigned long memsize_kb; size_t i; while (*tmp && (*tmp != '\n')) ++tmp; *tmp = 0; printf ("content: %s \n", content); tmp = content; while (*tmp && (*tmp != 0x20)) ++tmp; while (*tmp && (*tmp == 0x20)) ++tmp; printf ("TMP: %s \n", tmp); int items = sscanf (tmp, "%lu kB", &memsize_kb); printf ("MEMSIZE: %lu ITEMS: %i\n", memsize_kb, items); assert (items == 1); device->global_mem_size = memsize_kb * 1024; POCL_MEM_FREE (content); } else { POCL_MSG_WARN ("Cannot get memory size\n"); device->global_mem_size = 256 << 20; } /* max_compute_units */ if (pocl_read_file (CPUS, &content, &filesize) == 0) { long start, end; int items = sscanf (content, "%lu-%lu", &start, &end); assert (items == 2); device->max_compute_units = (unsigned)end + 1; POCL_MEM_FREE (content); } else { POCL_MSG_WARN ("Cannot get logical CPU number\n"); device->max_compute_units = 1; } return 0; }
/* options must be non-NULL. * modded_options[size] + link_options are preallocated outputs */ static cl_int process_options (const char *options, char *modded_options, char *link_options, cl_program program, int compiling, int linking, int *create_library, unsigned *flush_denorms, int *requires_correctly_rounded_sqrt_div, int *spir_build, size_t size) { cl_int error; char *token = NULL; char *saveptr = NULL; *create_library = 0; *flush_denorms = 0; *requires_correctly_rounded_sqrt_div = 0; *spir_build = 0; int enable_link_options = 0; link_options[0] = 0; modded_options[0] = 0; int ret_error = (linking ? (compiling ? CL_INVALID_BUILD_OPTIONS : CL_INVALID_LINKER_OPTIONS) : CL_INVALID_COMPILER_OPTIONS); assert (options); assert (modded_options); assert (compiling || linking); size_t i = 1; /* terminating char */ size_t needed = 0; char *temp_options = (char*) malloc (strlen (options) + 1); strcpy (temp_options, options); token = strtok_r (temp_options, " ", &saveptr); while (token != NULL) { /* check if parameter is supported compiler parameter */ if (strncmp (token, "-cl", 3) == 0 || strncmp (token, "-w", 2) == 0 || strncmp (token, "-Werror", 7) == 0) { if (strstr (cl_program_link_options, token)) { /* when linking, only a subset of -cl* options are valid, * and only with -enable-link-options */ if (linking && (!compiling)) { if (!enable_link_options) { APPEND_TO_MAIN_BUILD_LOG ( "Not compiling but link options were not enabled, " "therefore %s is an invalid option\n", token); error = ret_error; goto ERROR; } strcat (link_options, token); } if (strstr (token, "-cl-denorms-are-zero")) { *flush_denorms = 1; } if (strstr (token, "-cl-fp32-correctly-rounded-divide-sqrt")) { *requires_correctly_rounded_sqrt_div = 1; } } if (strstr (cl_parameters, token)) { /* the LLVM API call pushes the parameters directly to the frontend without using -Xclang */ } else if (strstr (cl_parameters_supported_after_clang_3_9, token)) { #ifndef LLVM_OLDER_THAN_3_9 /* the LLVM API call pushes the parameters directly to the * frontend without using -Xclang*/ #else APPEND_TO_MAIN_BUILD_LOG ( "This build option is supported after clang3.9: %s\n", token); token = strtok_r (NULL, " ", &saveptr); continue; #endif } else if (strstr (cl_parameters_not_yet_supported_by_clang, token)) { APPEND_TO_MAIN_BUILD_LOG ( "This build option is not yet supported by clang: %s\n", token); token = strtok_r (NULL, " ", &saveptr); continue; } else { APPEND_TO_MAIN_BUILD_LOG("Invalid build option: %s\n", token); error = ret_error; goto ERROR; } } else if (strncmp (token, "-g", 2) == 0) { #ifndef LLVM_OLDER_THAN_3_8 token = "-dwarf-column-info -debug-info-kind=limited " \ "-dwarf-version=4 -debugger-tuning=gdb"; #endif } else if (strncmp (token, "-D", 2) == 0 || strncmp (token, "-I", 2) == 0) { APPEND_TOKEN(); /* if there is a space in between, then next token is part of the option */ if (strlen (token) == 2) token = strtok_r (NULL, " ", &saveptr); else { token = strtok_r (NULL, " ", &saveptr); continue; } } else if (strncmp (token, "-x", 2) == 0 && strlen (token) == 2) { /* only "-x spir" is valid for the "-x" option */ token = strtok_r (NULL, " ", &saveptr); if (!token || strncmp (token, "spir", 4) != 0) { APPEND_TO_MAIN_BUILD_LOG ( "Invalid parameter to -x build option\n"); error = ret_error; goto ERROR; } /* "-x spir" is not valid if we are building from source */ else if (program->source) { APPEND_TO_MAIN_BUILD_LOG ( "\"-x spir\" is not valid when building from source\n"); error = ret_error; goto ERROR; } else *spir_build = 1; token = strtok_r (NULL, " ", &saveptr); continue; } else if (strncmp (token, "-spir-std=1.2", 13) == 0) { /* "-spir-std=" flags are not valid when building from source */ if (program->source) { APPEND_TO_MAIN_BUILD_LOG ("\"-spir-std=\" flag is not valid " "when building from source\n"); error = ret_error; goto ERROR; } else *spir_build = 1; token = strtok_r (NULL, " ", &saveptr); continue; } else if (strncmp (token, "-create-library", 15) == 0) { if (!linking) { APPEND_TO_MAIN_BUILD_LOG ( "\"-create-library\" flag is only valid when linking\n"); error = ret_error; goto ERROR; } *create_library = 1; token = strtok_r (NULL, " ", &saveptr); continue; } else if (strncmp (token, "-enable-link-options", 20) == 0) { if (!linking) { APPEND_TO_MAIN_BUILD_LOG ("\"-enable-link-options\" flag is " "only valid when linking\n"); error = ret_error; goto ERROR; } if (!(*create_library)) { APPEND_TO_MAIN_BUILD_LOG ("\"-enable-link-options\" flag is " "only valid when -create-library " "option was given\n"); error = ret_error; goto ERROR; } enable_link_options = 1; token = strtok_r (NULL, " ", &saveptr); continue; } else { APPEND_TO_MAIN_BUILD_LOG ("Invalid build option: %s\n", token); error = ret_error; goto ERROR; } APPEND_TOKEN (); token = strtok_r (NULL, " ", &saveptr); } error = CL_SUCCESS; /* remove trailing whitespace */ i = strlen (modded_options); if ((i > 0) && (modded_options[i - 1] == ' ')) modded_options[i - 1] = 0; ERROR: POCL_MEM_FREE (temp_options); return error; }
cl_int pocl_create_command (_cl_command_node **cmd, cl_command_queue command_queue, cl_command_type command_type, cl_event *event_p, cl_int num_events, const cl_event *wait_list) { int i; int err; cl_event *event = NULL; /* the provided waiting list will be cloned, because the calling program * might recycle the array for a different command. */ cl_event *event_wl = NULL; /* Additionally, if the command queue is non-empty and in-order, we want to * add the previous command to the waiting list: double-bang to ensure that * add_prev_command will be 1 in this case, and 0 otherwise */ cl_int add_prev_command = !!( !(command_queue->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) && command_queue->root != NULL); if ((wait_list == NULL && num_events != 0) || (wait_list != NULL && num_events == 0)) return CL_INVALID_EVENT_WAIT_LIST; for (i = 0; i < num_events; ++i) { if (wait_list[i] == NULL) return CL_INVALID_EVENT_WAIT_LIST; } *cmd = pocl_mem_manager_new_command (); if (*cmd == NULL) return CL_OUT_OF_HOST_MEMORY; if (num_events || add_prev_command) { event_wl = (cl_event*)malloc((num_events + add_prev_command)*sizeof(cl_event)); if (event_wl == NULL) return CL_OUT_OF_HOST_MEMORY; } /* if user does not provide event pointer, create event anyway */ event = &((*cmd)->event); err = pocl_create_event(event, command_queue->context, command_queue, command_type); if (err != CL_SUCCESS) { POCL_MEM_FREE(event_wl); POCL_MEM_FREE(*cmd); return err; } if (event_p) *event_p = *event; else (*event)->implicit_event = 1; /* clone the event list */ for (i = 0; i < num_events; ++i) { event_wl[i] = wait_list[i]; } if (add_prev_command) { // find the previous command _cl_command_node *prev_command; for (prev_command = command_queue->root; prev_command->next != NULL; prev_command = prev_command->next){} //printf("create_command: prev_com=%d prev_com->event = %d \n",prev_command, prev_command->event); event_wl[i] = prev_command->event; } #if 0 for (i = 0; i < num_events + add_prev_command; ++i) { printf("create-command: event_wl[%i]=%p\n", i, event_wl[i]); } #endif (*cmd)->event_wait_list = event_wl; (*cmd)->num_events_in_wait_list = num_events + add_prev_command; (*cmd)->type = command_type; (*cmd)->next = NULL; (*cmd)->device = command_queue->device; //printf("create_command (end): event=%d new_event=%d cmd->event=%d cmd=%d\n", event, new_event, (*cmd)->event, *cmd); return CL_SUCCESS; }
static void exec_commands (_cl_command_node *node_list) { int i; cl_event *event = NULL; _cl_command_node *node; cl_command_queue command_queue = NULL; event_callback_item* cb_ptr; LL_FOREACH (node_list, node) { event = &(node->event); /* Command queue is needed for POCL_UPDATE_EVENT macros */ if (node->event) command_queue = node->event->queue; if (node->device->ops->compile_submitted_kernels) node->device->ops->compile_submitted_kernels (node); switch (node->type) { case CL_COMMAND_READ_BUFFER: POCL_UPDATE_EVENT_RUNNING(event, command_queue); node->device->ops->read (node->device->data, node->command.read.host_ptr, node->command.read.device_ptr, node->command.read.offset, node->command.read.cb); POCL_UPDATE_EVENT_COMPLETE(event, command_queue); POname(clReleaseMemObject) (node->command.read.buffer); break; case CL_COMMAND_WRITE_BUFFER: POCL_UPDATE_EVENT_RUNNING(event, command_queue); node->device->ops->write (node->device->data, node->command.write.host_ptr, node->command.write.device_ptr, node->command.write.offset, node->command.write.cb); POCL_UPDATE_EVENT_COMPLETE(event, command_queue); POname(clReleaseMemObject) (node->command.write.buffer); break; case CL_COMMAND_COPY_BUFFER: POCL_UPDATE_EVENT_RUNNING(event, command_queue); node->device->ops->copy (node->command.copy.data, node->command.copy.src_ptr, node->command.copy.src_offset, node->command.copy.dst_ptr, node->command.copy.dst_offset, node->command.copy.cb); POCL_UPDATE_EVENT_COMPLETE(event, command_queue); POname(clReleaseMemObject) (node->command.copy.src_buffer); POname(clReleaseMemObject) (node->command.copy.dst_buffer); break; case CL_COMMAND_MAP_IMAGE: case CL_COMMAND_MAP_BUFFER: POCL_UPDATE_EVENT_RUNNING(event, command_queue); pocl_map_mem_cmd (node->device, node->command.map.buffer, node->command.map.mapping); POCL_UPDATE_EVENT_COMPLETE(event, command_queue); break; case CL_COMMAND_WRITE_IMAGE: POCL_UPDATE_EVENT_RUNNING(event, command_queue); node->device->ops->write_rect (node->device->data, node->command.rw_image.host_ptr, node->command.rw_image.device_ptr, node->command.rw_image.origin, node->command.rw_image.origin, node->command.rw_image.region, node->command.rw_image.rowpitch, node->command.rw_image.slicepitch, node->command.rw_image.rowpitch, node->command.rw_image.slicepitch); POCL_UPDATE_EVENT_COMPLETE(event, command_queue); break; case CL_COMMAND_READ_IMAGE: POCL_UPDATE_EVENT_RUNNING(event, command_queue); node->device->ops->read_rect (node->device->data, node->command.rw_image.host_ptr, node->command.rw_image.device_ptr, node->command.rw_image.origin, node->command.rw_image.origin, node->command.rw_image.region, node->command.rw_image.rowpitch, node->command.rw_image.slicepitch, node->command.rw_image.rowpitch, node->command.rw_image.slicepitch); POCL_UPDATE_EVENT_COMPLETE(event, command_queue); break; case CL_COMMAND_UNMAP_MEM_OBJECT: POCL_UPDATE_EVENT_RUNNING(event, command_queue); if ((node->command.unmap.memobj)->flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { /* TODO: should we ensure the device global region is updated from the host memory? How does the specs define it, can the host_ptr be assumed to point to the host and the device accessible memory or just point there until the kernel(s) get executed or similar? */ /* Assume the region is automatically up to date. */ } else { /* TODO: fixme. The offset computation must be done at the device driver. */ if (node->device->ops->unmap_mem != NULL) node->device->ops->unmap_mem (node->device->data, (node->command.unmap.mapping)->host_ptr, (node->command.unmap.memobj)->device_ptrs[node->device->dev_id].mem_ptr, (node->command.unmap.mapping)->size); } DL_DELETE((node->command.unmap.memobj)->mappings, node->command.unmap.mapping); (node->command.unmap.memobj)->map_count--; POCL_UPDATE_EVENT_COMPLETE(event, command_queue); break; case CL_COMMAND_NDRANGE_KERNEL: assert (*event == node->event); POCL_UPDATE_EVENT_RUNNING(event, command_queue); node->device->ops->run(node->command.run.data, node); POCL_UPDATE_EVENT_COMPLETE(event, command_queue); for (i = 0; i < node->command.run.arg_buffer_count; ++i) { cl_mem buf = node->command.run.arg_buffers[i]; if (buf == NULL) continue; /*printf ("### releasing arg %d - the buffer %x of kernel %s\n", i, buf, node->command.run.kernel->function_name); */ POname(clReleaseMemObject) (buf); } POCL_MEM_FREE(node->command.run.arg_buffers); POCL_MEM_FREE(node->command.run.tmp_dir); for (i = 0; i < node->command.run.kernel->num_args + node->command.run.kernel->num_locals; ++i) { pocl_aligned_free (node->command.run.arguments[i].value); node->command.run.arguments[i].value = NULL; } POCL_MEM_FREE(node->command.run.arguments); POname(clReleaseKernel)(node->command.run.kernel); break; case CL_COMMAND_NATIVE_KERNEL: POCL_UPDATE_EVENT_RUNNING(event, command_queue); node->device->ops->run_native(node->command.native.data, node); POCL_UPDATE_EVENT_COMPLETE(event, command_queue); for (i = 0; i < node->command.native.num_mem_objects; ++i) { cl_mem buf = node->command.native.mem_list[i]; if (buf == NULL) continue; POname(clReleaseMemObject) (buf); } POCL_MEM_FREE(node->command.native.mem_list); POCL_MEM_FREE(node->command.native.args); break; case CL_COMMAND_FILL_IMAGE: POCL_UPDATE_EVENT_RUNNING(event, command_queue); node->device->ops->fill_rect (node->command.fill_image.data, node->command.fill_image.device_ptr, node->command.fill_image.buffer_origin, node->command.fill_image.region, node->command.fill_image.rowpitch, node->command.fill_image.slicepitch, node->command.fill_image.fill_pixel, node->command.fill_image.pixel_size); POCL_MEM_FREE(node->command.fill_image.fill_pixel); POCL_UPDATE_EVENT_COMPLETE(event, command_queue); break; case CL_COMMAND_MARKER: POCL_UPDATE_EVENT_RUNNING(event, command_queue); POCL_UPDATE_EVENT_COMPLETE(event, command_queue); break; default: POCL_ABORT_UNIMPLEMENTED("clFinish: Unknown command"); break; } if (event) { /* event callback handling just call functions in the same order they were added */ for (cb_ptr = (*event)->callback_list; cb_ptr; cb_ptr = cb_ptr->next) { cb_ptr->callback_function ((*event), cb_ptr->trigger_status, cb_ptr->user_data); } if ((*event)->implicit_event) POname(clReleaseEvent) (*event); } }
int pocl_topology_detect_device_info (cl_device_id device) { device->global_mem_cacheline_size = HOST_CPU_CACHELINE_SIZE; device->global_mem_cache_type = 0x2; // CL_READ_WRITE_CACHE, without including all of CL/cl.h /* global mem cache size */ char *content; uint64_t filesize; if (pocl_read_file (L3_CACHE_SIZE, &content, &filesize) == 0) { long val = atol (content); device->global_mem_cache_size = val * 1024; POCL_MEM_FREE (content); } else { if (pocl_read_file (L2_CACHE_SIZE, &content, &filesize) == 0) { long val = atol (content); device->global_mem_cache_size = val * 1024; POCL_MEM_FREE (content); } else { POCL_MSG_WARN ( "Could not figure out CPU cache size, using bogus value\n"); device->global_mem_cache_size = 1 << 20; } } /* global_mem_size */ if (pocl_read_file (MEMINFO, &content, &filesize) == 0) { char *tmp = content; unsigned long memsize_kb; size_t i; while (*tmp && (*tmp != '\n')) ++tmp; *tmp = 0; tmp = content; while (*tmp && (*tmp != 0x20)) ++tmp; while (*tmp && (*tmp == 0x20)) ++tmp; int items = sscanf (tmp, "%lu kB", &memsize_kb); assert (items == 1); device->global_mem_size = memsize_kb * 1024; POCL_MEM_FREE (content); } else { POCL_MSG_WARN ("Cannot get memory size\n"); device->global_mem_size = 256 << 20; } /* max_compute_units */ pocl_read_file (CPUS, &content, &filesize); /* files in /sys report file size larger than can be actually read, so pocl_read_file() returns an error (even if it correctly reads the file). */ if (content && content[0] != 0) { unsigned long start, end; int items = sscanf (content, "%lu-%lu", &start, &end); assert (items == 2); device->max_compute_units = (unsigned)end + 1; POCL_MEM_FREE (content); } else { POCL_MSG_WARN ("Cannot get logical CPU number\n"); device->max_compute_units = 1; } return 0; }
/* creates either a program with binaries, or an empty program. The latter * is useful for clLinkProgram() which needs an empty program to put the * compiled results in. */ cl_program create_program_skeleton (cl_context context, cl_uint num_devices, const cl_device_id *device_list, const size_t *lengths, const unsigned char **binaries, cl_int *binary_status, cl_int *errcode_ret, int allow_empty_binaries) { cl_program program; unsigned i,j; int errcode, is_spirv_opencl; cl_device_id *unique_devlist = NULL; POCL_GOTO_ERROR_COND((context == NULL), CL_INVALID_CONTEXT); POCL_GOTO_ERROR_COND((device_list == NULL), CL_INVALID_VALUE); POCL_GOTO_ERROR_COND((num_devices == 0), CL_INVALID_VALUE); if (!allow_empty_binaries) { POCL_GOTO_ERROR_COND ((lengths == NULL), CL_INVALID_VALUE); for (i = 0; i < num_devices; ++i) { POCL_GOTO_ERROR_ON ((lengths[i] == 0 || binaries[i] == NULL), CL_INVALID_VALUE, "%i-th binary is NULL or its length==0\n", i); } } // check for duplicates in device_list[]. for (i = 0; i < context->num_devices; i++) { int count = 0; for (j = 0; j < num_devices; j++) { count += context->devices[i] == device_list[j]; } // duplicate devices POCL_GOTO_ERROR_ON((count > 1), CL_INVALID_DEVICE, "device %s specified multiple times\n", context->devices[i]->long_name); } // convert subdevices to devices and remove duplicates cl_uint real_num_devices = 0; unique_devlist = pocl_unique_device_list(device_list, num_devices, &real_num_devices); num_devices = real_num_devices; device_list = unique_devlist; // check for invalid devices in device_list[]. for (i = 0; i < num_devices; i++) { int found = 0; for (j = 0; j < context->num_devices; j++) { found |= context->devices[j] == device_list[i]; } POCL_GOTO_ERROR_ON((!found), CL_INVALID_DEVICE, "device not found in the device list of the context\n"); } if ((program = (cl_program) calloc (1, sizeof (struct _cl_program))) == NULL) { errcode = CL_OUT_OF_HOST_MEMORY; goto ERROR; } POCL_INIT_OBJECT(program); if ((program->binary_sizes = (size_t*) calloc (num_devices, sizeof(size_t))) == NULL || (program->binaries = (unsigned char**) calloc (num_devices, sizeof(unsigned char*))) == NULL || (program->pocl_binaries = (unsigned char**) calloc (num_devices, sizeof(unsigned char*))) == NULL || (program->pocl_binary_sizes = (size_t*) calloc (num_devices, sizeof(size_t))) == NULL || (program->build_log = (char**) calloc (num_devices, sizeof(char*))) == NULL || ((program->llvm_irs = (void**) calloc (num_devices, sizeof(void*))) == NULL) || ((program->build_hash = (SHA1_digest_t*) calloc (num_devices, sizeof(SHA1_digest_t))) == NULL)) { errcode = CL_OUT_OF_HOST_MEMORY; goto ERROR_CLEAN_PROGRAM_AND_BINARIES; } program->context = context; program->num_devices = num_devices; program->devices = unique_devlist; program->build_status = CL_BUILD_NONE; program->binary_type = CL_PROGRAM_BINARY_TYPE_NONE; char program_bc_path[POCL_FILENAME_LENGTH]; if (allow_empty_binaries && (lengths == NULL) && (binaries == NULL)) goto SUCCESS; for (i = 0; i < num_devices; ++i) { /* LLVM IR */ if (!strncmp((const char *)binaries[i], "BC", 2)) { program->binary_sizes[i] = lengths[i]; program->binaries[i] = (unsigned char*) malloc(lengths[i]); memcpy (program->binaries[i], binaries[i], lengths[i]); if (binary_status != NULL) binary_status[i] = CL_SUCCESS; } /* SPIR-V binary needs to be converted, and requires * linking of the converted BC */ #ifdef OCS_AVAILABLE else if (bitcode_is_spirv ((const char *)binaries[i], lengths[i], &is_spirv_opencl)) { if (is_spirv_opencl == 0) { // SPIR-V but not OpenCL-type. POCL_GOTO_ERROR_ON ( 1, CL_BUILD_PROGRAM_FAILURE, "SPIR-V binary provided, but is not using Kernel mode." "Pocl can't process this binary.\n"); } int no_spir = strstr (device_list[i]->extensions, "cl_khr_spir") == NULL; POCL_GOTO_ERROR_ON ( no_spir, CL_BUILD_PROGRAM_FAILURE, "SPIR binary provided, but device has no SPIR support"); #ifdef ENABLE_SPIRV POCL_MSG_PRINT_LLVM ( "SPIR-V binary detected, converting to LLVM SPIR\n"); char program_bc_spirv[POCL_FILENAME_LENGTH]; char program_bc_temp[POCL_FILENAME_LENGTH]; pocl_cache_write_spirv (program_bc_spirv, (const char *)binaries[i], (uint64_t)lengths[i]); pocl_cache_tempname (program_bc_temp, ".bc", NULL); char *args[] = { LLVM_SPIRV, "-r", "-o", program_bc_temp, program_bc_spirv, NULL }; errcode = pocl_run_command (args); assert (errcode == 0); /* load LLVM SPIR binary. */ uint64_t fsize; char *content; pocl_read_file (program_bc_temp, &content, &fsize); program->binary_sizes[i] = fsize; program->binaries[i] = (unsigned char *)content; pocl_remove (program_bc_temp); #else POCL_GOTO_ERROR_ON ( 1, CL_BUILD_PROGRAM_FAILURE, "SPIR binary provided, but this pocl has no SPIR-V support." "SPIR-V support requires llvm-spirv converter binary.\n"); #endif } #endif /* Poclcc binary */ else if (pocl_binary_check_binary(device_list[i], binaries[i])) { program->pocl_binary_sizes[i] = lengths[i]; program->pocl_binaries[i] = (unsigned char*) malloc (lengths[i]); memcpy (program->pocl_binaries[i], binaries[i], lengths[i]); pocl_binary_set_program_buildhash (program, i, binaries[i]); int error = pocl_cache_create_program_cachedir (program, i, NULL, 0, program_bc_path); POCL_GOTO_ERROR_ON((error != 0), CL_BUILD_PROGRAM_FAILURE, "Could not create program cachedir"); POCL_GOTO_ERROR_ON(pocl_binary_deserialize (program, i), CL_INVALID_BINARY, "Could not unpack a pocl binary\n"); /* read program.bc, can be useful later */ if (pocl_exists (program_bc_path)) { pocl_read_file (program_bc_path, (char **)(&program->binaries[i]), (uint64_t *)(&program->binary_sizes[i])); } if (binary_status != NULL) binary_status[i] = CL_SUCCESS; } /* Unknown binary */ else { POCL_MSG_WARN ("Could not recognize binary\n"); if (binary_status != NULL) binary_status[i] = CL_INVALID_BINARY; errcode = CL_INVALID_BINARY; goto ERROR_CLEAN_PROGRAM_AND_BINARIES; } } SUCCESS: POCL_RETAIN_OBJECT(context); if (errcode_ret != NULL) *errcode_ret = CL_SUCCESS; return program; ERROR_CLEAN_PROGRAM_AND_BINARIES: if (program->binaries) for (i = 0; i < num_devices; ++i) POCL_MEM_FREE(program->binaries[i]); POCL_MEM_FREE(program->binaries); POCL_MEM_FREE(program->binary_sizes); if (program->pocl_binaries) for (i = 0; i < num_devices; ++i) POCL_MEM_FREE(program->pocl_binaries[i]); POCL_MEM_FREE(program->pocl_binaries); POCL_MEM_FREE(program->pocl_binary_sizes); /*ERROR_CLEAN_PROGRAM:*/ POCL_MEM_FREE(program); ERROR: POCL_MEM_FREE(unique_devlist); if(errcode_ret != NULL) { *errcode_ret = errcode; } return NULL; }
void pocl_hsa_run (void *data, _cl_command_node* cmd) { struct data *d; struct pocl_argument *al; unsigned i; cl_kernel kernel = cmd->command.run.kernel; struct pocl_context *pc = &cmd->command.run.pc; hsa_signal_value_t initial_value = 1; #if 0 /* Not yet supported by the reference library. */ hsa_kernel_dispatch_packet_t kernel_packet; #else hsa_dispatch_packet_t kernel_packet; #endif hsa_signal_t kernel_completion_signal = 0; hsa_region_t region; int error; amdgpu_args_t *args; /* 32b word offset in the kernel arguments buffer we can push the next argument to. */ int args_offset = 0; assert (data != NULL); d = (struct data *) data; d->current_kernel = kernel; memset (&kernel_packet, 0, sizeof (hsa_dispatch_packet_t)); #if 0 /* TODO: not yet supported by the open source runtime implementation. Assume the HSA Full profile so we can simply use host malloc(). */ hsa_agent_iterate_regions(kernel_agent, pocl_hsa_get_kernarg, ®ion); if (hsa_memory_allocate(region, sizeof(amdgpu_args_t), (void**)&args) != HSA_STATUS_SUCCESS) { assert (0 && "hsa_memory_allocate() failed."); } #else args = (amdgpu_args_t*)malloc(sizeof(amdgpu_args_t)); #endif kernel_packet.kernarg_address = (uint64_t)args; /* Process the kernel arguments. Convert the opaque buffer pointers to real device pointers, allocate dynamic local memory buffers, etc. */ for (i = 0; i < kernel->num_args; ++i) { al = &(cmd->command.run.arguments[i]); if (kernel->arg_info[i].is_local) { POCL_ABORT_UNIMPLEMENTED("pocl-hsa: local buffers not implemented."); #if 0 arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = pocl_hsa_malloc(data, 0, al->size, NULL); #endif } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER) { if (args_offset + 1 >= MAX_KERNEL_ARG_WORDS) POCL_ABORT("pocl-hsa: too many kernel arguments!"); /* Assuming the pointers are 64b (or actually the same as in host) due to HSA. TODO: the 32b profile. */ if (al->value == NULL) { args->kernel_args[args_offset] = 0; args->kernel_args[args_offset + 1] = 0; } else { *(uint64_t*)&args->kernel_args[args_offset] = (uint64_t)(*(cl_mem *) (al->value))-> device_ptrs[cmd->device->dev_id].mem_ptr; } args_offset += 2; #if 0 /* It's legal to pass a NULL pointer to clSetKernelArguments. In that case we must pass the same NULL forward to the kernel. Otherwise, the user must have created a buffer with per device pointers stored in the cl_mem. */ if (al->value == NULL) { arguments[i] = malloc (sizeof (void *)); *(void **)arguments[i] = NULL; } else arguments[i] = &((*(cl_mem *) (al->value))->device_ptrs[cmd->device->dev_id].mem_ptr); #endif } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE) { POCL_ABORT_UNIMPLEMENTED("hsa: image arguments not implemented."); #if 0 dev_image_t di; fill_dev_image_t (&di, al, cmd->device); void* devptr = pocl_hsa_malloc (data, 0, sizeof(dev_image_t), NULL); arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = devptr; pocl_hsa_write (data, &di, devptr, 0, sizeof(dev_image_t)); #endif } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER) { POCL_ABORT_UNIMPLEMENTED("hsa: sampler arguments not implemented."); #if 0 dev_sampler_t ds; arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = pocl_hsa_malloc (data, 0, sizeof(dev_sampler_t), NULL); pocl_hsa_write (data, &ds, *(void**)arguments[i], 0, sizeof(dev_sampler_t)); #endif } else { if (args_offset >= MAX_KERNEL_ARG_WORDS) POCL_ABORT("pocl-hsa: too many kernel arguments!"); /* Assuming the scalar fits to a 32b slot. TODO! */ assert (al->size <= 4); args->kernel_args[args_offset] = *(uint32_t*)al->value; ++args_offset; } } for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i) { POCL_ABORT_UNIMPLEMENTED("hsa: automatic local buffers not implemented."); #if 0 al = &(cmd->command.run.arguments[i]); arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = pocl_hsa_malloc (data, 0, al->size, NULL); #endif } args->workgroup_size_x = kernel_packet.workgroup_size_x = cmd->command.run.local_x; args->workgroup_size_y = kernel_packet.workgroup_size_y = cmd->command.run.local_y; args->workgroup_size_z = kernel_packet.workgroup_size_z = cmd->command.run.local_z; kernel_packet.grid_size_x = pc->num_groups[0] * cmd->command.run.local_x; kernel_packet.grid_size_y = pc->num_groups[1] * cmd->command.run.local_y; kernel_packet.grid_size_z = pc->num_groups[2] * cmd->command.run.local_z; /* AMDGPU specific OpenCL argument data. */ args->wgs_x = pc->num_groups[0]; args->wgs_y = pc->num_groups[1]; args->wgs_z = pc->num_groups[2]; kernel_packet.dimensions = 1; if (cmd->command.run.local_y > 1) kernel_packet.dimensions = 2; if (cmd->command.run.local_z > 1) kernel_packet.dimensions = 3; kernel_packet.header.type = HSA_PACKET_TYPE_DISPATCH; kernel_packet.header.acquire_fence_scope = HSA_FENCE_SCOPE_SYSTEM; kernel_packet.header.release_fence_scope = HSA_FENCE_SCOPE_SYSTEM; kernel_packet.header.barrier = 1; kernel_packet.kernel_object_address = *(hsa_amd_code_t*)cmd->command.run.device_data[1]; error = hsa_signal_create(initial_value, 0, NULL, &kernel_completion_signal); assert (error == HSA_STATUS_SUCCESS); kernel_packet.completion_signal = kernel_completion_signal; { /* Launch the kernel by allocating a slot in the queue, writing the command to it, signaling the update with a door bell and finally, block waiting until finish signalled with the completion_signal. */ const uint32_t queue_mask = d->queue->size - 1; uint64_t queue_index = hsa_queue_load_write_index_relaxed(d->queue); hsa_signal_value_t sigval; ((hsa_dispatch_packet_t*)(d->queue->base_address))[queue_index & queue_mask] = kernel_packet; hsa_queue_store_write_index_relaxed(d->queue, queue_index + 1); hsa_signal_store_relaxed(d->queue->doorbell_signal, queue_index); sigval = hsa_signal_wait_acquire(kernel_completion_signal, HSA_EQ, 0, (uint64_t)(-1), HSA_WAIT_EXPECTANCY_UNKNOWN); } for (i = 0; i < kernel->num_args; ++i) { if (kernel->arg_info[i].is_local) { #if 0 pocl_hsa_free (data, 0, *(void **)(arguments[i])); POCL_MEM_FREE(arguments[i]); #endif } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE) { #if 0 pocl_hsa_free (data, 0, *(void **)(arguments[i])); POCL_MEM_FREE(arguments[i]); #endif } #if 0 else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER || (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER && *(void**)args->kernel_args[i] == NULL)) { POCL_MEM_FREE(arguments[i]); } #endif } for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i) { #if 0 pocl_hsa_free(data, 0, *(void **)(arguments[i])); POCL_MEM_FREE(arguments[i]); #endif } free(args); }
void pocl_basic_free_ptr (cl_device_id device, void* mem_ptr) { /* TODO we should somehow figure out the size argument * and call pocl_free_global_mem */ POCL_MEM_FREE(mem_ptr); }
void pocl_basic_run (void *data, _cl_command_node* cmd) { struct data *d; struct pocl_argument *al; size_t x, y, z; unsigned i; cl_kernel kernel = cmd->command.run.kernel; struct pocl_context *pc = &cmd->command.run.pc; assert (data != NULL); d = (struct data *) data; d->current_kernel = kernel; void **arguments = (void**)malloc( sizeof(void*) * (kernel->num_args + kernel->num_locals) ); /* Process the kernel arguments. Convert the opaque buffer pointers to real device pointers, allocate dynamic local memory buffers, etc. */ for (i = 0; i < kernel->num_args; ++i) { al = &(cmd->command.run.arguments[i]); if (kernel->arg_info[i].is_local) { arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, al->size); } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER) { /* It's legal to pass a NULL pointer to clSetKernelArguments. In that case we must pass the same NULL forward to the kernel. Otherwise, the user must have created a buffer with per device pointers stored in the cl_mem. */ if (al->value == NULL) { arguments[i] = malloc (sizeof (void *)); *(void **)arguments[i] = NULL; } else arguments[i] = &((*(cl_mem *) (al->value))->device_ptrs[cmd->device->dev_id].mem_ptr); } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE) { dev_image_t di; fill_dev_image_t (&di, al, cmd->device); void* devptr = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, sizeof(dev_image_t)); arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = devptr; pocl_basic_write (data, &di, devptr, 0, sizeof(dev_image_t)); } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER) { dev_sampler_t ds; fill_dev_sampler_t(&ds, al); void* devptr = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, sizeof(dev_sampler_t)); arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = devptr; pocl_basic_write (data, &ds, devptr, 0, sizeof(dev_sampler_t)); } else { arguments[i] = al->value; } } for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i) { al = &(cmd->command.run.arguments[i]); arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, al->size); } for (z = 0; z < pc->num_groups[2]; ++z) { for (y = 0; y < pc->num_groups[1]; ++y) { for (x = 0; x < pc->num_groups[0]; ++x) { pc->group_id[0] = x; pc->group_id[1] = y; pc->group_id[2] = z; cmd->command.run.wg (arguments, pc); } } } for (i = 0; i < kernel->num_args; ++i) { if (kernel->arg_info[i].is_local) { POCL_MEM_FREE(*(void **)(arguments[i])); POCL_MEM_FREE(arguments[i]); } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE || kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER) { POCL_MEM_FREE(*(void **)(arguments[i])); POCL_MEM_FREE(arguments[i]); } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER && *(void**)arguments[i] == NULL) { POCL_MEM_FREE(arguments[i]); } } for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i) { POCL_MEM_FREE(*(void **)(arguments[i])); POCL_MEM_FREE(arguments[i]); } free(arguments); }
cl_int pocl_create_command (_cl_command_node **cmd, cl_command_queue command_queue, cl_command_type command_type, cl_event *event_p, cl_int num_events, const cl_event *wait_list) { int i; int err; cl_event *event = NULL; if ((wait_list == NULL && num_events != 0) || (wait_list != NULL && num_events == 0)) return CL_INVALID_EVENT_WAIT_LIST; for (i = 0; i < num_events; ++i) { if (wait_list[i] == NULL) return CL_INVALID_EVENT_WAIT_LIST; } *cmd = pocl_mem_manager_new_command (); if (*cmd == NULL) return CL_OUT_OF_HOST_MEMORY; /* if user does not provide event pointer, create event anyway */ event = &((*cmd)->event); err = pocl_create_event(event, command_queue, command_type); if (err != CL_SUCCESS) { POCL_MEM_FREE(*cmd); return err; } if (event_p) *event_p = *event; else (*event)->implicit_event = 1; /* if in-order command queue and queue is not empty, add event from previous command to new commands event_waitlist */ if (!(command_queue->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) && command_queue->root != NULL) { _cl_command_node *prev_command; for (prev_command = command_queue->root; prev_command->next != NULL; prev_command = prev_command->next){} //printf("create_command: prev_com=%d prev_com->event = %d \n",prev_command, prev_command->event); cl_event *new_wl = (cl_event*)malloc ((num_events +1)*sizeof (cl_event)); for (i = 0; i < num_events; ++i) { new_wl[i] = wait_list[i]; } new_wl[i] = prev_command->event; (*cmd)->event_wait_list = new_wl; (*cmd)->num_events_in_wait_list = num_events + 1; for (i = 0; i < num_events + 1; ++i) { //printf("create-command: new_wl[%i]=%d\n", i, new_wl[i]); } } else { (*cmd)->event_wait_list = wait_list; (*cmd)->num_events_in_wait_list = num_events; } (*cmd)->type = command_type; (*cmd)->next = NULL; (*cmd)->device = command_queue->device; //printf("create_command (end): event=%d new_event=%d cmd->event=%d cmd=%d\n", event, new_event, (*cmd)->event, *cmd); return CL_SUCCESS; }