/* * init_opencl_context_and_shmem * * We can have performance gain using asynchronous DMA transfer when data * chunk it moved to OpenCL device from host machine, however, it requires * preparations to ensure the memory region to be copied to/from is pinned * on RAM; not swapped out. OpenCL provides an interface to map a certain * host address area as pinned buffer object, even though its size is * restricted to CL_DEVICE_MAX_MEM_ALLOC_SIZE parameter. Usually, it is * much less than size of shared memory to be assigned to PG-Strom, around * 500MB - 2GB in typical GPU/MIC device. So, we need to split a flat * continuous memory into several 'zones' to pin it using OpenCL interface. * Because it is a job of OpenCL intermediation server to collect properties * of devices, and this server shall be launched post initialization stage, * we also have to acquire and pin the shared memory region in the context * of OpenCL intermediation server, not postmaster itself. */ static void init_opencl_context_and_shmem(void) { Size zone_length = LONG_MAX; cl_int i, rc; /* * Create an OpenCL context */ opencl_context = clCreateContext(NULL, opencl_num_devices, opencl_devices, NULL, NULL, &rc); if (rc != CL_SUCCESS) elog(ERROR, "clCreateContext failed: %s", opencl_strerror(rc)); /* * Create an OpenCL command queue for each device */ for (i=0; i < opencl_num_devices; i++) { const pgstrom_device_info *dev_info = pgstrom_get_device_info(i); opencl_cmdq[i] = clCreateCommandQueue(opencl_context, opencl_devices[i], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &rc); if (rc != CL_SUCCESS) elog(ERROR, "clCreateCommandQueue failed: %s", opencl_strerror(rc)); if (zone_length > dev_info->dev_max_mem_alloc_size) zone_length = (dev_info->dev_max_mem_alloc_size & ~((1UL << 20) - 1)); } /* Lock shared memory of PG-Strom's private area */ pgstrom_setup_shmem(zone_length, on_shmem_zone_callback); /* Lock shared memory of shared buffer area */ if 
(!on_shmem_zone_callback(BufferBlocks, NBuffers * (Size) BLCKSZ, "buffer", false)) { Size total_size = NBuffers * (Size) BLCKSZ; Size offset; Assert((zone_length & (BLCKSZ - 1)) == 0); for (offset = 0; offset < total_size; offset += zone_length) { on_shmem_zone_callback(BufferBlocks + offset, Min(zone_length, total_size - offset), "buffer", true); } } }
/*
 * pgstrom_strerror
 *
 * Maps an error code to a human readable message. Negative codes are
 * delegated to opencl_strerror(); the remaining codes are matched against
 * the StromError_* constants. For a code that matches none of them the
 * message is formatted into a static buffer, so that result is overwritten
 * by the next such call.
 */
const char *
pgstrom_strerror(cl_int errcode)
{
	static char	fallback[256];
	const char *text = NULL;

	if (errcode < 0)
		return opencl_strerror(errcode);

	switch (errcode)
	{
		case StromError_Success:
			text = "success";
			break;
		case StromError_RowFiltered:
			text = "row is filtered";
			break;
		case StromError_RowReCheck:
			text = "row should be rechecked";
			break;
		case StromError_ServerNotReady:
			text = "OpenCL server is not ready";
			break;
		case StromError_BadRequestMessage:
			text = "request message is bad";
			break;
		case StromError_OpenCLInternal:
			text = "OpenCL internal error";
			break;
		case StromError_OutOfSharedMemory:
			text = "out of shared memory";
			break;
		case StromError_DivisionByZero:
			text = "division by zero";
			break;
		default:
			break;
	}

	if (text != NULL)
		return text;

	snprintf(fallback, sizeof(fallback),
			 "undefined strom error (code: %d)", errcode);
	return fallback;
}
int main(int argc, char *argv[]) { cl_platform_id platform_ids[32]; cl_uint platform_num; cl_int i, c, rc; while ((c = getopt(argc, argv, "lp:d:")) != -1) { switch (c) { case 'l': only_list = 1; break; case 'p': only_platform = atoi(optarg); break; case 'd': only_device = atoi(optarg); break; default: fprintf(stderr, "usage: %s [-l] [-p <platform>] [-d <device>]\n", basename(argv[0])); return 1; } } rc = clGetPlatformIDs(lengthof(platform_ids), platform_ids, &platform_num); if (rc != CL_SUCCESS) { fprintf(stderr, "failed on clGetPlatformIDs (%s)", opencl_strerror(rc)); return 1; } for (i=0; i < platform_num; i++) { if (only_platform < 0 || i + 1 == only_platform) dump_platform(i, platform_ids[i]); } return 0; }
/*
 * on_shmem_zone_callback
 *
 * Invoked once per zone while the shared memory segment is being set up.
 * Wraps [address, address + length) into an OpenCL buffer object created
 * with CL_MEM_USE_HOST_PTR, intended for asynchronous memory transfer
 * later on.
 *
 * Returns the buffer object (as an opaque pointer). Raises ERROR when the
 * buffer object cannot be created.
 */
static void *
on_shmem_zone_callback(void *address, Size length)
{
	char	   *zone_last = (char *) address + length - 1;
	cl_int		status;
	cl_mem		zone_buffer;

	zone_buffer = clCreateBuffer(opencl_context,
								 CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
								 length,
								 address,
								 &status);
	if (status != CL_SUCCESS)
		elog(ERROR, "clCreateBuffer failed on host memory (%p-%p): %s",
			 address, zone_last, opencl_strerror(status));

	elog(LOG, "PG-Strom: zone %p-%p was mapped (len: %luMB)",
		 address, zone_last, length >> 20);

	return zone_buffer;
}
/*
 * pgstrom_strerror
 *
 * Maps an error code to a human readable message. Negative codes are
 * delegated to opencl_strerror(); the remaining codes are matched against
 * the StromError_* constants. For a code that matches none of them the
 * message is formatted into a static buffer, so that result is overwritten
 * by the next such call.
 */
const char *
pgstrom_strerror(cl_int errcode)
{
	static char	fallback[256];
	const char *text = NULL;

	if (errcode < 0)
		return opencl_strerror(errcode);

	switch (errcode)
	{
		case StromError_Success:
			text = "Success";
			break;
		case StromError_RowFiltered:
			text = "Row is filtered";
			break;
		case StromError_CpuReCheck:
			text = "To be re-checked by CPU";
			break;
		case StromError_ServerNotReady:
			text = "OpenCL server is not ready";
			break;
		case StromError_BadRequestMessage:
			text = "Request message is bad";
			break;
		case StromError_OpenCLInternal:
			text = "OpenCL internal error";
			break;
		case StromError_OutOfSharedMemory:
			text = "out of shared memory";
			break;
		case StromError_OutOfMemory:
			text = "out of host memory";
			break;
		case StromError_DataStoreCorruption:
			text = "data store is corrupted";
			break;
		case StromError_DataStoreNoSpace:
			text = "data store has no space";
			break;
		case StromError_DataStoreOutOfRange:
			text = "out of range in data store";
			break;
		case StromError_SanityCheckViolation:
			text = "sanity check violation";
			break;
		default:
			break;
	}

	if (text != NULL)
		return text;

	snprintf(fallback, sizeof(fallback),
			 "undefined strom error (code: %d)", errcode);
	return fallback;
}
/*
 * on_shmem_zone_callback
 *
 * Invoked once per zone while the shared memory segment is being set up.
 * Creates an OpenCL buffer object over [address, address + length) with
 * CL_MEM_USE_HOST_PTR, intended for asynchronous memory transfer later on.
 * The buffer handle itself is deliberately discarded.
 *
 * label           - name of the region, used in the log message only
 * abort_on_error  - if true, a failure raises ERROR instead of returning
 *
 * Returns true when the buffer object was created, false otherwise.
 */
static bool
on_shmem_zone_callback(void *address, Size length,
					   const char *label, bool abort_on_error)
{
	char	   *zone_last = (char *) address + length - 1;
	cl_int		status;

	(void) clCreateBuffer(opencl_context,
						  CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
						  length,
						  address,
						  &status);
	if (status == CL_SUCCESS)
	{
		elog(LOG, "PG-Strom: %s %p-%p was mapped (len: %luMB)",
			 label, address, zone_last, length >> 20);
		return true;
	}

	if (abort_on_error)
		elog(ERROR, "clCreateBuffer failed on host memory (%p-%p): %s",
			 address, zone_last, opencl_strerror(status));
	return false;
}
/*
 * run_test
 *
 * Measures host <-> device transfer speed on the given command queue and
 * prints a report.
 *
 * For each of num_trial rounds the host buffer is written to the device in
 * num_chunks pieces of chunk_size bytes and then read back in one piece.
 * Events chain the commands: from the second round on, every write waits
 * for the read of the previous round, and every read waits for all the
 * writes of its own round. In async mode (is_blocking false) the host
 * buffer is additionally wrapped in a CL_MEM_USE_HOST_PTR buffer object
 * before the transfers start.
 *
 * namebuf - device name, used in the report only
 * context - OpenCL context used to create the buffer objects
 * cmdq    - command queue the transfers are enqueued on
 *
 * Any failure terminates through error_exit().
 */
static void
run_test(const char *namebuf, cl_context context, cl_command_queue cmdq)
{
	cl_event   *ev;
	char	   *hmem;
	cl_mem		dmem;
	cl_mem		pinned = NULL;
	cl_int		num_chunks;
	cl_int		rc, i, j, k;
	struct timeval tv1, tv2;

	num_chunks = buffer_size / chunk_size;

	/*
	 * One event per write plus one per read, for every round.
	 * "rc" holds no meaningful value yet at this point, so it must not be
	 * used to describe an allocation failure.
	 */
	ev = malloc(sizeof(cl_event) * (num_chunks + 1) * num_trial);
	if (!ev)
		error_exit("out of memory");
	hmem = malloc(buffer_size);
	if (!hmem)
		error_exit("out of memory");

	dmem = clCreateBuffer(context,
						  CL_MEM_READ_WRITE,
						  buffer_size,
						  NULL,
						  &rc);
	if (rc != CL_SUCCESS)
		error_exit("failed on clCreateBuffer(size=%lu) (%s)",
				   buffer_size, opencl_strerror(rc));

	gettimeofday(&tv1, NULL);
	if (!is_blocking)
	{
		pinned = clCreateBuffer(context,
								CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
								buffer_size,
								hmem,
								&rc);
		if (rc != CL_SUCCESS)
			error_exit("failed on clCreateBuffer(size=%lu) (%s)",
					   buffer_size, opencl_strerror(rc));
	}

	for (i=0, k=0; i < num_trial; i++)
	{
		for (j=0; j < num_chunks; j++)
		{
			/* ev[k-1] is the read event of the previous round */
			rc = clEnqueueWriteBuffer(cmdq,
									  dmem,
									  is_blocking,
									  j * chunk_size,
									  chunk_size,
									  hmem + j * chunk_size,
									  i > 0 ? 1 : 0,
									  i > 0 ? &ev[k-1] : NULL,
									  &ev[k+j]);
			if (rc != CL_SUCCESS)
				error_exit("failed on clEnqueueWriteBuffer (%s)",
						   opencl_strerror(rc));
		}
		/* the read depends on every write of this round */
		rc = clEnqueueReadBuffer(cmdq,
								 dmem,
								 is_blocking,
								 0,
								 buffer_size,
								 hmem,
								 num_chunks,
								 &ev[k],
								 &ev[k+num_chunks]);
		if (rc != CL_SUCCESS)
			error_exit("failed on clEnqueueReadBuffer (%s)",
					   opencl_strerror(rc));
		k += num_chunks + 1;
	}
	rc = clFinish(cmdq);
	if (rc != CL_SUCCESS)
		error_exit("failed on clFinish (%s)", opencl_strerror(rc));
	gettimeofday(&tv2, NULL);

	printf("DMA send/recv test result\n"
		   "device: %s\n"
		   "size: %luMB\n"
		   "chunks: %lu%s x %d\n"
		   "ntrials: %d\n"
		   "total_size: %luMB\n"
		   "time: %.2fs\n"
		   "speed: %.2fMB/s\n"
		   "mode: %s\n",
		   namebuf,
		   buffer_size >> 20,
		   chunk_size > (1UL<<20) ? chunk_size >> 20 : chunk_size >> 10,
		   chunk_size > (1UL<<20) ? "MB" : "KB",
		   num_chunks,
		   num_trial,
		   (buffer_size >> 20) * num_trial,
		   (double)((tv2.tv_sec * 1000000 + tv2.tv_usec) -
					(tv1.tv_sec * 1000000 + tv1.tv_usec)) / 1000000.0,
		   (double)(((buffer_size >> 20) * num_trial) * 1000000) /
		   (double)((tv2.tv_sec * 1000000 + tv2.tv_usec) -
					(tv1.tv_sec * 1000000 + tv1.tv_usec)),
		   is_blocking ? "sync" : "async");

	/*
	 * release resources; the host-pointer buffer object has to go before
	 * the host memory it refers to is freed
	 */
	if (pinned)
		clReleaseMemObject(pinned);
	clReleaseMemObject(dmem);
	free(hmem);
	free(ev);
}
int main(int argc, char *argv[]) { cl_platform_id platform_ids[32]; cl_int platform_num; cl_device_id device_ids[256]; cl_int device_num; cl_context context; cl_command_queue cmdq; cl_int c, rc; char namebuf[1024]; while ((c = getopt(argc, argv, "p:d:m:n:s:c:")) >= 0) { switch (c) { case 'p': platform_idx = atoi(optarg); break; case 'd': device_idx = atoi(optarg); break; case 'm': if (strcmp(optarg, "sync") == 0) is_blocking = CL_TRUE; else if (strcmp(optarg, "async") == 0) is_blocking = CL_FALSE; else usage(basename(argv[0])); break; case 'n': num_trial = atoi(optarg); break; case 's': buffer_size = atoi(optarg) << 20; break; case 'c': chunk_size = atoi(optarg) << 10; break; default: usage(basename(argv[0])); break; } } if (optind != argc) usage(basename(argv[0])); if (chunk_size == 0) chunk_size = buffer_size; else if (buffer_size % chunk_size != 0 || buffer_size < chunk_size) { fprintf(stderr, "chunk_size (-c) must be aligned to buffer_size\n"); return 1; } /* * Initialize OpenCL platform/device */ opencl_entry_init(); /* Get platform IDs */ rc = clGetPlatformIDs(lengthof(platform_ids), platform_ids, &platform_num); if (rc != CL_SUCCESS) error_exit("failed on clGetPlatformIDs (%s)", opencl_strerror(rc)); if (platform_idx < 1 || platform_idx > platform_num) error_exit("opencl platform index %d did not exist", platform_idx); /* Get device IDs */ rc = clGetDeviceIDs(platform_ids[platform_idx - 1], CL_DEVICE_TYPE_ALL, lengthof(device_ids), device_ids, &device_num); if (rc != CL_SUCCESS) error_exit("failed on clGetDeviceIDs (%s)\n", opencl_strerror(rc)); if (device_idx < 1 || device_idx > device_num) error_exit("opencl device index %d did not exist", device_idx); /* Get name of opencl device */ rc = clGetDeviceInfo(device_ids[device_idx - 1], CL_DEVICE_NAME, sizeof(namebuf), namebuf, NULL); if (rc != CL_SUCCESS) error_exit("failed on clGetDeviceInfo (%s)", opencl_strerror(rc)); /* Construct an OpenCL context */ context = clCreateContext(NULL, 1, 
&device_ids[device_idx - 1], NULL, NULL, &rc); if (rc != CL_SUCCESS) error_exit("failed to create an opencl context (%s)", opencl_strerror(rc)); /* Construct an OpenCL command queue */ cmdq = clCreateCommandQueue(context, device_ids[device_idx - 1], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &rc); if (rc != CL_SUCCESS) error_exit("failed to create an opencl command queue (%s)", opencl_strerror(rc)); /* do the job */ run_test(namebuf, context, cmdq); /* cleanup resources */ clReleaseCommandQueue(cmdq); clReleaseContext(context); return 0; }
/* * pgstrom_collect_device_info * * It collects properties of all the OpenCL devices. It shall be called once * by the OpenCL management worker process, prior to any other backends. */ static List * construct_opencl_device_info(int platform_index) { cl_platform_id platforms[32]; cl_device_id devices[MAX_NUM_DEVICES]; cl_uint n_platform; cl_uint n_devices; cl_int i, j, rc; long score_max = -1; List *result = NIL; rc = clGetPlatformIDs(lengthof(platforms), platforms, &n_platform); if (rc != CL_SUCCESS) elog(ERROR, "clGetPlatformIDs failed (%s)", opencl_strerror(rc)); for (i=0; i < n_platform; i++) { pgstrom_platform_info *pl_info; pgstrom_device_info *dev_info; long score = 0; List *temp = NIL; pl_info = collect_opencl_platform_info(platforms[i]); pl_info->pl_index = i; rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, lengthof(devices), devices, &n_devices); if (rc != CL_SUCCESS) elog(ERROR, "clGetDeviceIDs failed (%s)", opencl_strerror(rc)); elog(LOG, "PG-Strom: [%d] OpenCL Platform: %s", i, pl_info->pl_name); for (j=0; j < n_devices; j++) { dev_info = collect_opencl_device_info(devices[j]); dev_info->pl_info = pl_info; dev_info->dev_index = j; elog(LOG, "PG-Strom: + device %s (%uMHz x %uunits, %luMB)", dev_info->dev_name, dev_info->dev_max_clock_frequency, dev_info->dev_max_compute_units, dev_info->dev_global_mem_size >> 20); /* rough estimation about computing power */ if ((dev_info->dev_type & CL_DEVICE_TYPE_GPU) != 0) score += 32 * (dev_info->dev_max_compute_units * dev_info->dev_max_clock_frequency); else score += (dev_info->dev_max_compute_units * dev_info->dev_max_clock_frequency); temp = lappend(temp, dev_info); } if (platform_index == i || (platform_index < 0 && score > score_max)) { opencl_platform_id = platforms[i]; opencl_num_devices = n_devices; for (j=0; j < n_devices; j++) opencl_devices[j] = devices[j]; score_max = score; result = temp; } } /* show platform name if auto-selection */ if 
(platform_index < 0 && result != NIL) { pgstrom_platform_info *pl_info = ((pgstrom_device_info *) linitial(result))->pl_info; elog(LOG, "PG-Strom: auto platform selection: %s", pl_info->pl_name); } if (result != NIL) { /* * Create an OpenCL context */ opencl_context = clCreateContext(NULL, opencl_num_devices, opencl_devices, NULL, NULL, &rc); if (rc != CL_SUCCESS) elog(ERROR, "clCreateContext failed: %s", opencl_strerror(rc)); /* * Create an OpenCL command queue for each device */ for (j=0; j < opencl_num_devices; j++) { opencl_cmdq[j] = clCreateCommandQueue(opencl_context, opencl_devices[j], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &rc); if (rc != CL_SUCCESS) elog(ERROR, "clCreateCommandQueue failed: %s", opencl_strerror(rc)); } } return result; }
/*
 * collect_opencl_device_info
 *
 * Queries the properties listed in the catalog below from the given OpenCL
 * device and stores them in a newly palloc'ed pgstrom_device_info.
 * Fixed-size properties are written straight into their struct field;
 * string properties are packed one after another into the trailing
 * buffer[] and the matching "char *" field is pointed at them.
 *
 * After collection the device is checked against the requirements visible
 * below: FULL_PROFILE, type CPU/GPU/accelerator, available, compiler
 * available, little endian, OpenCL C 1.1 or later, and exactly 3 work-item
 * dimensions.
 *
 * Returns the device information, or NULL (after logging the reason and
 * freeing the structure) when the device fails a check. Raises ERROR when
 * a property cannot be obtained or the string buffer is exhausted.
 */
pgstrom_device_info *
collect_opencl_device_info(cl_device_id device_id)
{
	pgstrom_device_info *dev_info;
	Size		offset = 0;
	Size		buflen = 10240;
	cl_int		i, rc;
	int			major, minor;
	static struct {
		cl_uint		param;
		size_t		size;
		size_t		offset;
		bool		is_cstring;
	} catalog[] = {
		CLDEV_PARAM(CL_DEVICE_ADDRESS_BITS, dev_address_bits, false),
		CLDEV_PARAM(CL_DEVICE_AVAILABLE, dev_available, false),
		CLDEV_PARAM(CL_DEVICE_COMPILER_AVAILABLE,
					dev_compiler_available, false),
		CLDEV_PARAM(CL_DEVICE_DOUBLE_FP_CONFIG,
					dev_double_fp_config, false),
		CLDEV_PARAM(CL_DEVICE_ENDIAN_LITTLE, dev_endian_little, false),
		CLDEV_PARAM(CL_DEVICE_ERROR_CORRECTION_SUPPORT,
					dev_error_correction_support, false),
		CLDEV_PARAM(CL_DEVICE_EXECUTION_CAPABILITIES,
					dev_execution_capabilities, false),
		CLDEV_PARAM(CL_DEVICE_EXTENSIONS, dev_device_extensions, true),
		CLDEV_PARAM(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
					dev_global_mem_cache_size, false),
		CLDEV_PARAM(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE,
					dev_global_mem_cache_type, false),
		CLDEV_PARAM(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE,
					dev_global_mem_cacheline_size, false),
		CLDEV_PARAM(CL_DEVICE_GLOBAL_MEM_SIZE, dev_global_mem_size, false),
		CLDEV_PARAM(CL_DEVICE_HOST_UNIFIED_MEMORY,
					dev_host_unified_memory, false),
		CLDEV_PARAM(CL_DEVICE_LOCAL_MEM_SIZE, dev_local_mem_size, false),
		CLDEV_PARAM(CL_DEVICE_LOCAL_MEM_TYPE, dev_local_mem_type, false),
		CLDEV_PARAM(CL_DEVICE_MAX_CLOCK_FREQUENCY,
					dev_max_clock_frequency, false),
		CLDEV_PARAM(CL_DEVICE_MAX_COMPUTE_UNITS,
					dev_max_compute_units, false),
		CLDEV_PARAM(CL_DEVICE_MAX_CONSTANT_ARGS,
					dev_max_constant_args, false),
		CLDEV_PARAM(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
					dev_max_constant_buffer_size, false),
		CLDEV_PARAM(CL_DEVICE_MAX_MEM_ALLOC_SIZE,
					dev_max_mem_alloc_size, false),
		CLDEV_PARAM(CL_DEVICE_MAX_PARAMETER_SIZE,
					dev_max_parameter_size, false),
		CLDEV_PARAM(CL_DEVICE_MAX_SAMPLERS, dev_max_samplers, false),
		CLDEV_PARAM(CL_DEVICE_MAX_WORK_GROUP_SIZE,
					dev_max_work_group_size, false),
		CLDEV_PARAM(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
					dev_max_work_item_dimensions, false),
		CLDEV_PARAM(CL_DEVICE_MAX_WORK_ITEM_SIZES,
					dev_max_work_item_sizes, false),
		CLDEV_PARAM(CL_DEVICE_MEM_BASE_ADDR_ALIGN,
					dev_mem_base_addr_align, false),
		CLDEV_PARAM(CL_DEVICE_NAME, dev_name, true),
		CLDEV_PARAM(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR,
					dev_native_vector_width_char, false),
		CLDEV_PARAM(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT,
					dev_native_vector_width_short, false),
		CLDEV_PARAM(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT,
					dev_native_vector_width_int, false),
		CLDEV_PARAM(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG,
					dev_native_vector_width_long, false),
		CLDEV_PARAM(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT,
					dev_native_vector_width_float, false),
		CLDEV_PARAM(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE,
					dev_native_vector_width_double, false),
		CLDEV_PARAM(CL_DEVICE_OPENCL_C_VERSION,
					dev_opencl_c_version, true),
		CLDEV_PARAM(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,
					dev_preferred_vector_width_char, false),
		CLDEV_PARAM(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
					dev_preferred_vector_width_short, false),
		CLDEV_PARAM(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
					dev_preferred_vector_width_int, false),
		CLDEV_PARAM(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
					dev_preferred_vector_width_long, false),
		CLDEV_PARAM(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
					dev_preferred_vector_width_float, false),
		CLDEV_PARAM(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
					dev_preferred_vector_width_double, false),
		CLDEV_PARAM(CL_DEVICE_PROFILE, dev_profile, true),
		CLDEV_PARAM(CL_DEVICE_PROFILING_TIMER_RESOLUTION,
					dev_profiling_timer_resolution, false),
		CLDEV_PARAM(CL_DEVICE_QUEUE_PROPERTIES,
					dev_queue_properties, false),
		CLDEV_PARAM(CL_DEVICE_SINGLE_FP_CONFIG,
					dev_single_fp_config, false),
		CLDEV_PARAM(CL_DEVICE_TYPE, dev_type, false),
		CLDEV_PARAM(CL_DEVICE_VENDOR, dev_vendor, true),
		CLDEV_PARAM(CL_DEVICE_VENDOR_ID, dev_vendor_id, false),
		CLDEV_PARAM(CL_DEVICE_VERSION, dev_version, true),
		CLDEV_PARAM(CL_DRIVER_VERSION, driver_version, true)
	};

	/* only the fixed part is cleared; buffer[] is filled on demand */
	dev_info = palloc(offsetof(pgstrom_device_info, buffer[buflen]));
	memset(dev_info, 0, sizeof(pgstrom_device_info));

	for (i=0; i < lengthof(catalog); i++)
	{
		size_t		param_size;
		size_t		param_retsz;
		char	   *param_addr;

		if (!catalog[i].is_cstring)
		{
			param_size = catalog[i].size;
			param_addr = (char *)dev_info + catalog[i].offset;
		}
		else
		{
			Assert(catalog[i].size == sizeof(char *));

			/*
			 * MAXALIGN() below may push the offset up to (or past) the end
			 * of the buffer; check before subtracting unsigned values.
			 */
			if (offset + 1 >= buflen)
				elog(ERROR, "out of buffer space for OpenCL device properties");

			/*
			 * Keep one byte back: the terminator appended below goes to
			 * param_addr[param_retsz], and param_retsz may be as large as
			 * the size handed to clGetDeviceInfo.
			 */
			param_size = buflen - offset - 1;
			param_addr = &dev_info->buffer[offset];
		}

		rc = clGetDeviceInfo(device_id,
							 catalog[i].param,
							 param_size,
							 param_addr,
							 &param_retsz);
		if (rc != CL_SUCCESS)
			elog(ERROR, "failed on clGetDeviceInfo (param=%d, %s)",
				 catalog[i].param, opencl_strerror(rc));
		Assert(param_size == param_retsz || catalog[i].is_cstring);

		if (catalog[i].is_cstring)
		{
			param_addr[param_retsz] = '\0';
			/* point the struct's "char *" field at the packed string */
			*((char **)((char *)dev_info + catalog[i].offset)) = param_addr;
			offset += MAXALIGN(param_retsz);
		}
	}
	dev_info->buflen = offset;

	/*
	 * Check device capability is enough to run PG-Strom
	 */
	if (strcmp(dev_info->dev_profile, "FULL_PROFILE") != 0)
	{
		elog(LOG, "Profile of OpenCL device \"%s\" is \"%s\", skipped",
			 dev_info->dev_name, dev_info->dev_profile);
		goto out_clean;
	}
	if ((dev_info->dev_type & (CL_DEVICE_TYPE_CPU |
							   CL_DEVICE_TYPE_GPU |
							   CL_DEVICE_TYPE_ACCELERATOR)) == 0)
	{
		elog(LOG, "Only CPU, GPU or Accelerator are supported, skipped");
		goto out_clean;
	}
	if (!dev_info->dev_available)
	{
		elog(LOG, "OpenCL device \"%s\" is not available, skipped",
			 dev_info->dev_name);
		goto out_clean;
	}
	if (!dev_info->dev_compiler_available)
	{
		elog(LOG, "OpenCL compiler of device \"%s\" is not available, skipped",
			 dev_info->dev_name);
		goto out_clean;
	}
	if (!dev_info->dev_endian_little)
	{
		elog(LOG, "OpenCL device \"%s\" has big endian, not supported",
			 dev_info->dev_name);
		goto out_clean;
	}
	/* anything older than OpenCL C 1.1, or unparsable, is rejected */
	if (sscanf(dev_info->dev_opencl_c_version, "OpenCL C %d.%d ",
			   &major, &minor) != 2 ||
		major < 1 || (major == 1 && minor < 1))
	{
		elog(LOG, "OpenCL C version of \"%s\" is too old \"%s\", skipped",
			 dev_info->dev_name, dev_info->dev_opencl_c_version);
		goto out_clean;
	}
	if (dev_info->dev_max_work_item_dimensions != 3)
	{
		elog(LOG, "OpenCL device \"%s\" has work item dimensions larger than 3, skipped",
			 dev_info->dev_name);
		goto out_clean;
	}
	return dev_info;

out_clean:
	pfree(dev_info);
	return NULL;
}
/*
 * collect_opencl_platform_info
 *
 * Queries profile, version, name, vendor and extensions of the given
 * OpenCL platform and stores them in a newly palloc'ed
 * pgstrom_platform_info. The strings are packed one after another into
 * the trailing buffer[] and the matching "char *" field is pointed at
 * them.
 *
 * Returns the platform information, or NULL (after logging the reason and
 * freeing the structure) when the platform is not FULL_PROFILE or is older
 * than OpenCL 1.1. Raises ERROR when a property cannot be obtained or the
 * string buffer is exhausted.
 */
pgstrom_platform_info *
collect_opencl_platform_info(cl_platform_id platform_id)
{
	pgstrom_platform_info *pl_info;
	Size		offset = 0;
	Size		buflen = 10240;
	cl_int		i, rc;
	int			major, minor;
	static struct {
		cl_uint		param;
		size_t		size;
		size_t		offset;
		bool		is_cstring;
	} catalog[] = {
		CLPF_PARAM(CL_PLATFORM_PROFILE, pl_profile, true),
		CLPF_PARAM(CL_PLATFORM_VERSION, pl_version, true),
		CLPF_PARAM(CL_PLATFORM_NAME, pl_name, true),
		CLPF_PARAM(CL_PLATFORM_VENDOR, pl_vendor, true),
		CLPF_PARAM(CL_PLATFORM_EXTENSIONS, pl_extensions, true),
	};

	/* only the fixed part is cleared; buffer[] is filled on demand */
	pl_info = palloc(offsetof(pgstrom_platform_info, buffer[buflen]));
	memset(pl_info, 0, sizeof(pgstrom_platform_info));

	/* collect platform properties */
	for (i=0; i < lengthof(catalog); i++)
	{
		size_t		param_size;
		size_t		param_retsz;
		char	   *param_addr;

		if (!catalog[i].is_cstring)
		{
			param_size = catalog[i].size;
			param_addr = (char *)pl_info + catalog[i].offset;
		}
		else
		{
			Assert(catalog[i].size == sizeof(char *));

			/*
			 * MAXALIGN() below may push the offset up to (or past) the end
			 * of the buffer; check before subtracting unsigned values.
			 */
			if (offset + 1 >= buflen)
				elog(ERROR, "out of buffer space for OpenCL platform properties");

			/*
			 * Keep one byte back: the terminator appended below goes to
			 * param_addr[param_retsz], and param_retsz may be as large as
			 * the size handed to clGetPlatformInfo.
			 */
			param_size = buflen - offset - 1;
			param_addr = &pl_info->buffer[offset];
		}

		rc = clGetPlatformInfo(platform_id,
							   catalog[i].param,
							   param_size,
							   param_addr,
							   &param_retsz);
		if (rc != CL_SUCCESS)
			elog(ERROR, "failed on clGetPlatformInfo (param=%d, %s)",
				 catalog[i].param, opencl_strerror(rc));
		Assert(param_size == param_retsz || catalog[i].is_cstring);

		if (catalog[i].is_cstring)
		{
			param_addr[param_retsz] = '\0';
			/* point the struct's "char *" field at the packed string */
			*((char **)((char *)pl_info + catalog[i].offset)) = param_addr;
			offset += MAXALIGN(param_retsz);
		}
	}
	pl_info->buflen = offset;

	if (strcmp(pl_info->pl_profile, "FULL_PROFILE") != 0)
	{
		elog(LOG, "Profile of OpenCL driver \"%s\" is \"%s\", skipped",
			 pl_info->pl_name, pl_info->pl_profile);
		goto out_clean;
	}

	/* anything older than OpenCL 1.1, or unparsable, is rejected */
	if (sscanf(pl_info->pl_version, "OpenCL %d.%d ", &major, &minor) != 2 ||
		major < 1 || (major == 1 && minor < 1))
	{
		elog(LOG, "OpenCL version of \"%s\" is too old \"%s\", skipped",
			 pl_info->pl_name, pl_info->pl_version);
		goto out_clean;
	}
	return pl_info;

out_clean:
	pfree(pl_info);
	return NULL;
}
/*
 * dump_platform
 *
 * Prints the attributes of one OpenCL platform, followed by its devices.
 *
 * index       - 0-based position of the platform; printed 1-based
 * platform_id - platform to be queried
 *
 * The attributes are fetched into the addresses given by the catalog
 * entries and then printed from platform_info: one summary line when
 * only_list is set, the full set otherwise. dump_device() is called for
 * every device, or only for the one selected by only_device.
 * Exits the process with status 1 when an OpenCL query fails.
 */
static void
dump_platform(int index, cl_platform_id platform_id)
{
	static struct {
		cl_platform_info info;
		size_t		size;
		void	   *addr;
	} catalog[] = {
		PLATFORM_ATTR(CL_PLATFORM_PROFILE, profile),
		PLATFORM_ATTR(CL_PLATFORM_VERSION, version),
		PLATFORM_ATTR(CL_PLATFORM_NAME, name),
		PLATFORM_ATTR(CL_PLATFORM_VENDOR, vendor),
		PLATFORM_ATTR(CL_PLATFORM_EXTENSIONS, extensions),
	};
	cl_device_id	device_ids[256];
	cl_uint			device_num;
	cl_int			i, rc;

	for (i=0; i < lengthof(catalog); i++)
	{
		rc = clGetPlatformInfo(platform_id,
							   catalog[i].info,
							   catalog[i].size,
							   catalog[i].addr,
							   NULL);
		if (rc != CL_SUCCESS)
		{
			fprintf(stderr, "failed on clGetPlatformInfo (%s)\n",
					opencl_strerror(rc));
			exit(1);
		}
	}

	rc = clGetDeviceIDs(platform_id,
						CL_DEVICE_TYPE_ALL,
						lengthof(device_ids),
						device_ids,
						&device_num);
	if (rc != CL_SUCCESS)
	{
		fprintf(stderr, "failed on clGetDeviceIDs (%s)\n",
				opencl_strerror(rc));
		exit(1);
	}

	/*
	 * clGetDeviceIDs reports how many devices match, which may be more
	 * than the number of entries it was allowed to store. Never walk past
	 * the end of device_ids[].
	 */
	if (device_num > lengthof(device_ids))
		device_num = lengthof(device_ids);

	if (only_list)
		printf("Platform-%02d: %s / %s - %s\n", index + 1,
			   platform_info.vendor,
			   platform_info.name,
			   platform_info.version);
	else
	{
		printf("platform-index: %d\n", index + 1);
		printf("platform-vendor: %s\n", platform_info.vendor);
		printf("platform-name: %s\n", platform_info.name);
		printf("platform-version: %s\n", platform_info.version);
		printf("platform-profile: %s\n", platform_info.profile);
		printf("platform-extensions: %s\n", platform_info.extensions);
	}

	for (i=0; i < (cl_int) device_num; i++)
	{
		if (only_device < 0 || i + 1 == only_device)
			dump_device(i, device_ids[i]);
	}
	putchar('\n');
}
/*
 * dump_device
 *
 * Prints the attributes of one OpenCL device.
 *
 * index     - 0-based position of the device; printed 1-based
 * device_id - device to be queried
 *
 * The attributes are fetched into the addresses given by the catalog
 * entries and then printed from dinfo: one summary line when only_list is
 * set, the full set otherwise. Double and half precision items are printed
 * only if the extension string contains cl_khr_fp64 / cl_khr_fp16.
 * Exits the process with status 1 when a query fails, except that
 * CL_INVALID_VALUE is tolerated for the double/half FP config queries.
 */
static void
dump_device(int index, cl_device_id device_id)
{
	static struct {
		cl_device_info info;
		size_t		size;
		void	   *addr;
	} catalog[] = {
		DEVICE_ATTR(CL_DEVICE_ADDRESS_BITS, address_bits),
		DEVICE_ATTR(CL_DEVICE_AVAILABLE, available),
		DEVICE_ATTR(CL_DEVICE_COMPILER_AVAILABLE, compiler_available),
		DEVICE_ATTR(CL_DEVICE_DOUBLE_FP_CONFIG, double_fp_config),
		DEVICE_ATTR(CL_DEVICE_ENDIAN_LITTLE, endian_little),
		DEVICE_ATTR(CL_DEVICE_ERROR_CORRECTION_SUPPORT,
					error_correction_support),
		DEVICE_ATTR(CL_DEVICE_EXECUTION_CAPABILITIES,
					execution_capabilities),
		DEVICE_ATTR(CL_DEVICE_EXTENSIONS, extensions),
		DEVICE_ATTR(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, global_mem_cache_size),
		DEVICE_ATTR(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, global_mem_cache_type),
		DEVICE_ATTR(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE,
					global_mem_cacheline_size),
		DEVICE_ATTR(CL_DEVICE_GLOBAL_MEM_SIZE, global_mem_size),
		DEVICE_ATTR(CL_DEVICE_HALF_FP_CONFIG, half_fp_config),
		DEVICE_ATTR(CL_DEVICE_HOST_UNIFIED_MEMORY, host_unified_memory),
		DEVICE_ATTR(CL_DEVICE_IMAGE_SUPPORT, image_support),
		DEVICE_ATTR(CL_DEVICE_IMAGE2D_MAX_HEIGHT, image2d_max_height),
		DEVICE_ATTR(CL_DEVICE_IMAGE2D_MAX_WIDTH, image2d_max_width),
		DEVICE_ATTR(CL_DEVICE_IMAGE3D_MAX_DEPTH, image3d_max_depth),
		DEVICE_ATTR(CL_DEVICE_IMAGE3D_MAX_HEIGHT, image3d_max_height),
		DEVICE_ATTR(CL_DEVICE_IMAGE3D_MAX_WIDTH, image3d_max_width),
		DEVICE_ATTR(CL_DEVICE_LOCAL_MEM_SIZE, local_mem_size),
		DEVICE_ATTR(CL_DEVICE_LOCAL_MEM_TYPE, local_mem_type),
		DEVICE_ATTR(CL_DEVICE_MAX_CLOCK_FREQUENCY, max_clock_frequency),
		DEVICE_ATTR(CL_DEVICE_MAX_COMPUTE_UNITS, max_compute_units),
		DEVICE_ATTR(CL_DEVICE_MAX_CONSTANT_ARGS, max_constant_args),
		DEVICE_ATTR(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
					max_constant_buffer_size),
		DEVICE_ATTR(CL_DEVICE_MAX_MEM_ALLOC_SIZE, max_mem_alloc_size),
		DEVICE_ATTR(CL_DEVICE_MAX_PARAMETER_SIZE, max_parameter_size),
		DEVICE_ATTR(CL_DEVICE_MAX_READ_IMAGE_ARGS, max_read_image_args),
		DEVICE_ATTR(CL_DEVICE_MAX_SAMPLERS, max_samplers),
		DEVICE_ATTR(CL_DEVICE_MAX_WORK_GROUP_SIZE, max_work_group_size),
		DEVICE_ATTR(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
					max_work_item_dimensions),
		DEVICE_ATTR(CL_DEVICE_MAX_WORK_ITEM_SIZES, max_work_item_sizes),
		DEVICE_ATTR(CL_DEVICE_MAX_WRITE_IMAGE_ARGS, max_write_image_args),
		DEVICE_ATTR(CL_DEVICE_MEM_BASE_ADDR_ALIGN, mem_base_addr_align),
		DEVICE_ATTR(CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE,
					min_data_type_align_size),
		DEVICE_ATTR(CL_DEVICE_NAME, name),
		DEVICE_ATTR(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR,
					native_vector_width_char),
		DEVICE_ATTR(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT,
					native_vector_width_short),
		DEVICE_ATTR(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT,
					native_vector_width_int),
		DEVICE_ATTR(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG,
					native_vector_width_long),
		DEVICE_ATTR(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT,
					native_vector_width_float),
		DEVICE_ATTR(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE,
					native_vector_width_double),
		DEVICE_ATTR(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF,
					native_vector_width_half),
		DEVICE_ATTR(CL_DEVICE_OPENCL_C_VERSION, opencl_c_version),
		DEVICE_ATTR(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,
					preferred_vector_width_char),
		DEVICE_ATTR(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
					preferred_vector_width_short),
		DEVICE_ATTR(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
					preferred_vector_width_int),
		DEVICE_ATTR(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
					preferred_vector_width_long),
		DEVICE_ATTR(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
					preferred_vector_width_float),
		DEVICE_ATTR(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
					preferred_vector_width_double),
		DEVICE_ATTR(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF,
					preferred_vector_width_half),
		DEVICE_ATTR(CL_DEVICE_PROFILE, profile),
		DEVICE_ATTR(CL_DEVICE_PROFILING_TIMER_RESOLUTION,
					profiling_timer_resolution),
		DEVICE_ATTR(CL_DEVICE_QUEUE_PROPERTIES, queue_properties),
		DEVICE_ATTR(CL_DEVICE_SINGLE_FP_CONFIG, single_fp_config),
		DEVICE_ATTR(CL_DEVICE_TYPE, type),
		DEVICE_ATTR(CL_DEVICE_VENDOR, vendor),
		DEVICE_ATTR(CL_DEVICE_VENDOR_ID, vendor_id),
		DEVICE_ATTR(CL_DEVICE_VERSION, version),
		DEVICE_ATTR(CL_DRIVER_VERSION, driver_version),
	};
	cl_int		i, rc;

	for (i=0; i < lengthof(catalog); i++)
	{
		rc = clGetDeviceInfo(device_id,
							 catalog[i].info,
							 catalog[i].size,
							 catalog[i].addr,
							 NULL);
		/* CL_INVALID_VALUE is accepted for the double/half FP config only */
		if (rc != CL_SUCCESS &&
			!(rc == CL_INVALID_VALUE &&
			  (catalog[i].info == CL_DEVICE_DOUBLE_FP_CONFIG ||
			   catalog[i].info == CL_DEVICE_HALF_FP_CONFIG)))
		{
			fprintf(stderr, "failed on clGetDeviceInfo (%s)\n",
					opencl_strerror(rc));
			exit(1);
		}
	}

	if (only_list)
		printf(" Device-%02d: %s / %s - %s\n", index + 1,
			   dinfo.vendor, dinfo.name, dinfo.version);
	else
	{
		printf(" Device-%02d\n", index + 1);
		printf(" Device type: %s\n",
			   dev_type_str(dinfo.type));
		printf(" Vendor: %s (id: %08x)\n",
			   dinfo.vendor, dinfo.vendor_id);
		printf(" Name: %s\n",
			   dinfo.name);
		printf(" Version: %s\n",
			   dinfo.version);
		printf(" Driver version: %s\n",
			   dinfo.driver_version);
		printf(" OpenCL C version: %s\n",
			   dinfo.opencl_c_version);
		printf(" Profile: %s\n",
			   dinfo.profile);
		printf(" Device available: %s\n",
			   dinfo.available ? "yes" : "no");
		printf(" Address bits: %u\n",
			   dinfo.address_bits);
		printf(" Compiler available: %s\n",
			   dinfo.compiler_available ? "yes" : "no");
		if (strstr(dinfo.extensions, "cl_khr_fp64") != NULL)
			printf(" Double FP config: %s\n",
				   dev_fp_config_str(dinfo.double_fp_config));
		printf(" Endian: %s\n",
			   dinfo.endian_little ? "little" : "big");
		printf(" Error correction support: %s\n",
			   dinfo.error_correction_support ? "yes" : "no");
		printf(" Execution capability: %s\n",
			   dev_execution_capabilities_str(dinfo.execution_capabilities));
		printf(" Extensions: %s\n",
			   dinfo.extensions);
		printf(" Global memory cache size: %lu KB\n",
			   dinfo.global_mem_cache_size / 1024);
		printf(" Global memory cache type: %s\n",
			   dev_mem_cache_type_str(dinfo.global_mem_cache_type));
		printf(" Global memory cacheline size: %u\n",
			   dinfo.global_mem_cacheline_size);
		printf(" Global memory size: %zu MB\n",
			   dinfo.global_mem_size / (1024 * 1024));
		if (strstr(dinfo.extensions, "cl_khr_fp16") != NULL)
			printf(" Half FP config: %s\n",
				   dev_fp_config_str(dinfo.half_fp_config));
		printf(" Host unified memory: %s\n",
			   dinfo.host_unified_memory ? "yes" : "no");
		printf(" Image support: %s\n",
			   dinfo.image_support ? "yes" : "no");
		printf(" Image 2D max size: %lu x %lu\n",
			   dinfo.image2d_max_width,
			   dinfo.image2d_max_height);
		printf(" Image 3D max size: %lu x %lu x %lu\n",
			   dinfo.image3d_max_width,
			   dinfo.image3d_max_height,
			   dinfo.image3d_max_depth);
		printf(" Local memory size: %lu\n",
			   dinfo.local_mem_size);
		printf(" Local memory type: %s\n",
			   dev_local_mem_type_str(dinfo.local_mem_type));
		printf(" Max clock frequency: %u\n",
			   dinfo.max_clock_frequency);
		printf(" Max compute units: %u\n",
			   dinfo.max_compute_units);
		printf(" Max constant args: %u\n",
			   dinfo.max_constant_args);
		printf(" Max constant buffer size: %zu\n",
			   dinfo.max_constant_buffer_size);
		printf(" Max memory allocation size: %zu MB\n",
			   dinfo.max_mem_alloc_size / (1024 * 1024));
		/* %zu takes a size_t argument, so cast to exactly that type */
		printf(" Max parameter size: %zu\n",
			   (size_t)dinfo.max_parameter_size);
		printf(" Max read image args: %u\n",
			   dinfo.max_read_image_args);
		printf(" Max samplers: %u\n",
			   dinfo.max_samplers);
		printf(" Max work-group size: %zu\n",
			   (size_t)dinfo.max_work_group_size);
		printf(" Max work-item sizes: {%u,%u,%u}\n",
			   (cl_uint) dinfo.max_work_item_sizes[0],
			   (cl_uint) dinfo.max_work_item_sizes[1],
			   (cl_uint) dinfo.max_work_item_sizes[2]);
		printf(" Max write image args: %u\n",
			   dinfo.max_write_image_args);
		printf(" Memory base address align: %u\n",
			   dinfo.mem_base_addr_align);
		printf(" Min data type align size: %u\n",
			   dinfo.min_data_type_align_size);
		printf(" Native vector width - char: %u\n",
			   dinfo.native_vector_width_char);
		printf(" Native vector width - short: %u\n",
			   dinfo.native_vector_width_short);
		printf(" Native vector width - int: %u\n",
			   dinfo.native_vector_width_int);
		printf(" Native vector width - long: %u\n",
			   dinfo.native_vector_width_long);
		printf(" Native vector width - float: %u\n",
			   dinfo.native_vector_width_float);
		if (strstr(dinfo.extensions, "cl_khr_fp64") != NULL)
			printf(" Native vector width - double: %u\n",
				   dinfo.native_vector_width_double);
		if (strstr(dinfo.extensions, "cl_khr_fp16") != NULL)
			printf(" Native vector width - half: %u\n",
				   dinfo.native_vector_width_half);
		printf(" Preferred vector width - char: %u\n",
			   dinfo.preferred_vector_width_char);
		printf(" Preferred vector width - short: %u\n",
			   dinfo.preferred_vector_width_short);
		printf(" Preferred vector width - int: %u\n",
			   dinfo.preferred_vector_width_int);
		printf(" Preferred vector width - long: %u\n",
			   dinfo.preferred_vector_width_long);
		printf(" Preferred vector width - float: %u\n",
			   dinfo.preferred_vector_width_float);
		if (strstr(dinfo.extensions, "cl_khr_fp64") != NULL)
			printf(" Preferred vector width - double: %u\n",
				   dinfo.preferred_vector_width_double);
		if (strstr(dinfo.extensions, "cl_khr_fp16") != NULL)
			printf(" Preferred vector width - half: %u\n",
				   dinfo.preferred_vector_width_half);
		printf(" Profiling timer resolution: %lu\n",
			   dinfo.profiling_timer_resolution);
		printf(" Queue properties: %s\n",
			   dev_queue_properties_str(dinfo.queue_properties));
		printf(" Single FP config: %s\n",
			   dev_fp_config_str(dinfo.single_fp_config));
	}
}