static int hwloc_opencl_discover(struct hwloc_backend *backend) { struct hwloc_topology *topology = backend->topology; cl_platform_id *platform_ids = NULL; cl_uint nr_platforms; cl_int clret; unsigned j, res = 0; if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO))) return 0; if (!hwloc_topology_is_thissystem(topology)) { hwloc_debug("%s", "\nno OpenCL detection (not thissystem)\n"); return 0; } clret = clGetPlatformIDs(0, NULL, &nr_platforms); if (CL_SUCCESS != clret || !nr_platforms) return 0; hwloc_debug("%u OpenCL platforms\n", nr_platforms); platform_ids = malloc(nr_platforms * sizeof(*platform_ids)); if (!platform_ids) return 0; clret = clGetPlatformIDs(nr_platforms, platform_ids, &nr_platforms); if (CL_SUCCESS != clret || !nr_platforms) { free(platform_ids); return 0; } for(j=0; j<nr_platforms; j++) { cl_device_id *device_ids = NULL; cl_uint nr_devices; unsigned i; clret = clGetDeviceIDs(platform_ids[j], CL_DEVICE_TYPE_ALL, 0, NULL, &nr_devices); if (CL_SUCCESS != clret) continue; device_ids = malloc(nr_devices * sizeof(*device_ids)); clret = clGetDeviceIDs(platform_ids[j], CL_DEVICE_TYPE_ALL, nr_devices, device_ids, &nr_devices); if (CL_SUCCESS != clret) { free(device_ids); continue; } for(i=0; i<nr_devices; i++) { cl_platform_id platform_id = 0; cl_device_type type; #ifdef CL_DEVICE_TOPOLOGY_AMD cl_device_topology_amd amdtopo; #endif cl_ulong globalmemsize; cl_uint computeunits; hwloc_obj_t osdev, parent; char buffer[64]; hwloc_debug("This is opencl%dd%d\n", j, i); #ifdef CL_DEVICE_TOPOLOGY_AMD clret = clGetDeviceInfo(device_ids[i], CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL); if (CL_SUCCESS != clret) { hwloc_debug("no AMD-specific device information: %d\n", clret); continue; } else if (CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) { hwloc_debug("AMD-specific device topology reports non-PCIe device type: %u\n", amdtopo.raw.type); continue; } #else continue; #endif osdev = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1); snprintf(buffer, sizeof(buffer), "opencl%dd%d", j, i); osdev->name = strdup(buffer); osdev->depth = (unsigned) HWLOC_TYPE_DEPTH_UNKNOWN; osdev->attr->osdev.type = HWLOC_OBJ_OSDEV_COPROC; hwloc_obj_add_info(osdev, "CoProcType", "OpenCL"); hwloc_obj_add_info(osdev, "Backend", "OpenCL"); clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(type), &type, NULL); if (type == CL_DEVICE_TYPE_GPU) hwloc_obj_add_info(osdev, "OpenCLDeviceType", "GPU"); else if (type == CL_DEVICE_TYPE_ACCELERATOR) hwloc_obj_add_info(osdev, "OpenCLDeviceType", "Accelerator"); else if (type == CL_DEVICE_TYPE_CPU) hwloc_obj_add_info(osdev, "OpenCLDeviceType", "CPU"); else if (type == CL_DEVICE_TYPE_CUSTOM) hwloc_obj_add_info(osdev, "OpenCLDeviceType", "Custom"); else hwloc_obj_add_info(osdev, "OpenCLDeviceType", "Unknown"); buffer[0] = '\0'; clGetDeviceInfo(device_ids[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL); if (buffer[0] != '\0') hwloc_obj_add_info(osdev, "GPUVendor", buffer); buffer[0] = '\0'; #ifdef CL_DEVICE_BOARD_NAME_AMD clGetDeviceInfo(device_ids[i], CL_DEVICE_BOARD_NAME_AMD, sizeof(buffer), buffer, NULL); #else clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL); #endif if (buffer[0] != '\0') hwloc_obj_add_info(osdev, "GPUModel", buffer); snprintf(buffer, sizeof(buffer), "%u", j); hwloc_obj_add_info(osdev, "OpenCLPlatformIndex", buffer); buffer[0] = '\0'; clret = clGetDeviceInfo(device_ids[i], CL_DEVICE_PLATFORM, sizeof(platform_id), &platform_id, NULL); if (CL_SUCCESS == clret) { clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), buffer, NULL); if (buffer[0] != '\0') hwloc_obj_add_info(osdev, "OpenCLPlatformName", buffer); } snprintf(buffer, sizeof(buffer), "%u", i); hwloc_obj_add_info(osdev, "OpenCLPlatformDeviceIndex", buffer); clGetDeviceInfo(device_ids[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(computeunits), &computeunits, NULL); snprintf(buffer, sizeof(buffer), "%u", computeunits); hwloc_obj_add_info(osdev, "OpenCLComputeUnits", buffer); clGetDeviceInfo(device_ids[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(globalmemsize), &globalmemsize, NULL); snprintf(buffer, sizeof(buffer), "%llu", (unsigned long long) globalmemsize / 1024); hwloc_obj_add_info(osdev, "OpenCLGlobalMemorySize", buffer); parent = NULL; #ifdef CL_DEVICE_TOPOLOGY_AMD parent = hwloc_pci_belowroot_find_by_busid(topology, 0, amdtopo.pcie.bus, amdtopo.pcie.device, amdtopo.pcie.function); if (!parent) parent = hwloc_pci_find_busid_parent(topology, 0, amdtopo.pcie.bus, amdtopo.pcie.device, amdtopo.pcie.function); #endif if (!parent) parent = hwloc_get_root_obj(topology); hwloc_insert_object_by_parent(topology, parent, osdev); res++; } free(device_ids); } free(platform_ids); return res; }
//------------------------------------------------------------------------------ void print_platforms() { cl_uint numPlatforms = 0; cl_platform_id platform = 0; cl_int status = clGetPlatformIDs(0, 0, &numPlatforms); if(status != CL_SUCCESS) { std::cerr << "ERROR - clGetPlatformIDs()" << std::endl; exit(EXIT_FAILURE); } if(numPlatforms < 1) { std::cout << "No OpenCL platform detected" << std::endl; exit(EXIT_SUCCESS); } typedef std::vector< cl_platform_id > PlatformIds; PlatformIds platforms(numPlatforms); status = clGetPlatformIDs(platforms.size(), &platforms[0], 0); if(status != CL_SUCCESS) { std::cerr << "ERROR - clGetPlatformIDs()" << std::endl; exit(EXIT_FAILURE); } std::vector< char > buf(0x10000, char(0)); int p = 0; std::cout << "\n***************************************************\n"; std::cout << "Number of platforms: " << platforms.size() << std::endl; for(PlatformIds::const_iterator i = platforms.begin(); i != platforms.end(); ++i, ++p) { std::cout << "\n-----------\n"; std::cout << "Platform " << p << std::endl; std::cout << "-----------\n"; status = ::clGetPlatformInfo(*i, CL_PLATFORM_VENDOR, buf.size(), &buf[ 0 ], 0 ); if(status != CL_SUCCESS) { std::cerr << "ERROR - clGetPlatformInfo(): " << std::endl; exit(EXIT_FAILURE); } std::cout << "Vendor: " << &buf[ 0 ] << '\n'; status = ::clGetPlatformInfo(*i, CL_PLATFORM_PROFILE, buf.size(), &buf[ 0 ], 0 ); if(status != CL_SUCCESS) { std::cerr << "ERROR - clGetPlatformInfo(): " << std::endl; exit(EXIT_FAILURE); } std::cout << "Profile: " << &buf[ 0 ] << '\n'; status = ::clGetPlatformInfo(*i, CL_PLATFORM_VERSION, buf.size(), &buf[ 0 ], 0 ); if(status != CL_SUCCESS) { std::cerr << "ERROR - clGetPlatformInfo(): " << std::endl; exit(EXIT_FAILURE); } std::cout << "Version: " << &buf[ 0 ] << '\n'; status = ::clGetPlatformInfo(*i, CL_PLATFORM_NAME, buf.size(), &buf[ 0 ], 0 ); if(status != CL_SUCCESS) { std::cerr << "ERROR - clGetPlatformInfo(): " << std::endl; exit(EXIT_FAILURE); } std::cout << "Name: " << &buf[ 0 ] << '\n'; status = ::clGetPlatformInfo(*i, CL_PLATFORM_EXTENSIONS, buf.size(), &buf[ 0 ], 0 ); if(status != CL_SUCCESS) { std::cerr << "ERROR - clGetPlatformInfo(): " << std::endl; exit(EXIT_FAILURE); } std::cout << "Extensions: " << &buf[ 0 ] << '\n'; print_devices(*i); std::cout << "\n===================================================\n"; } }
int main() { int i,j,k; // nb of operations: const int dsize = 512; int nthreads = 1; int nbOfAverages = 1e4; int opsMAC = 2; // operations per MAC cl_float4 *in, *out; cl_float *ck; double tops; //total ops #define NQUEUES 1 cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queues[NQUEUES]; cl_mem bufin, bufck, bufout; cl_event event = NULL; cl_program program; cl_kernel kernel; size_t global[2], local[2]; size_t param[5]; char version[300]; // allocate matrices in = (cl_float4 *) calloc(dsize*dsize, sizeof(*in)); out = (cl_float4 *) calloc(dsize*dsize, sizeof(*out)); ck = (cl_float *) calloc(9*9, sizeof(*ck)); in[0].x = 2.0f; in[1].x = 3.0f; in[dsize].x = 1.0; ck[0] = 1.0f; ck[1] = 0.5f; ck[9] = 0.001f; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL ); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); for(i = 0; i < NQUEUES; i++) queues[i] = clCreateCommandQueue( ctx, device, 0, &err ); // Print some info about the system clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL); printf("CL_DEVICE_VERSION=%s\n", version); clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(version), version, NULL); printf("CL_DRIVER_VERSION=%s\n", version); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS=%d\n", (int)param[0]); j = param[0]; clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(param[0])*j, param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_SIZES="); for(i = 0; i < j; i++) printf("%d ", (int)param[i]); printf("\n"); clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE=%d\n", (int)param[0]); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); if(!program) { printf("Error creating program\n"); return -1; } err = clBuildProgram(program, 0, 0, 0, 0, 0); if(err != CL_SUCCESS) { char buffer[20000]; size_t len; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); puts(buffer); return -1; } kernel = clCreateKernel(program, "conv9x9", &err); if(!kernel || err != CL_SUCCESS) { printf("Error creating kernel\n"); return -1; } /* Prepare OpenCL memory objects and place matrices inside them. */ cl_image_format fmt = {CL_RGBA, CL_FLOAT}; cl_int rc; bufin = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufout = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufck = clCreateBuffer( ctx, CL_MEM_READ_ONLY, 9 * 9 * sizeof(*ck), NULL, &err ); size_t origin[3] = {0,0,0}; size_t region[3] = {dsize, dsize, 1}; err = clEnqueueWriteImage(queues[0], bufin, CL_TRUE, origin, region, dsize * sizeof(*in), 0, in, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queues[0], bufck, CL_TRUE, 0, 9 * 9 * sizeof( *ck ), ck, 0, NULL, NULL ); clSetKernelArg(kernel, 0, sizeof(int), &dsize); clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufin); clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufck); clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufout); local[0] = 8; local[1] = 8; global[0] = global[1] = dsize-32; usleep(100000); struct timeval start,end; gettimeofday(&start, NULL); for (k=0; k<nthreads; k++) { //printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads()); for(i=0;i<nbOfAverages;i++) { // do the 2D convolution err = clEnqueueNDRangeKernel(queues[0], kernel, 2, NULL, global, local, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("clEnqueueNDRangeKernel error %d\n", err); return -1; } } } clFinish(queues[0]); gettimeofday(&end, NULL); double t = ((double) (end.tv_sec - start.tv_sec)) + ((double) (end.tv_usec - start.tv_usec)) / 1e6; //reports time in [s] - verified! /* Wait for calculations to be finished. */ /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadImage(queues[0], bufout, CL_TRUE, origin, region, dsize * sizeof(*out), 0, out, 0, NULL, NULL ); clFinish(queues[0]); printf("%f %f %f %f\n", out[0].x, out[1].x, out[dsize].x, out[dsize+1].x); /* Release OpenCL memory objects. */ clReleaseMemObject( bufin ); clReleaseMemObject( bufck ); clReleaseMemObject( bufout ); /* Release OpenCL working objects. */ for(i = 0; i < NQUEUES; i++) clReleaseCommandQueue( queues[i] ); clReleaseContext( ctx ); // report performance: tops = 4 * nthreads * opsMAC * (dsize-32)*(dsize-32)*9*9; // total ops printf("Total M ops = %.0lf, # of threads = %d", nbOfAverages*tops*1e-6, nthreads); printf("\nTime in s: %lf:", t); printf("\nTest performance [G OP/s] %lf:", tops*nbOfAverages/t*1e-9); printf("\n"); return(0); }
void setup_opencl(const char* cl_source_filename, const char* cl_source_main, cl_device_id* device_id, cl_kernel* kernel, cl_context* context, cl_command_queue* queue) { cl_int err; // error code returned from api calls cl_platform_id platform_id; // compute device id cl_program program; // compute program cl_device_id devices[MAX_RESOURCES]; cl_platform_id platforms[MAX_RESOURCES]; unsigned int best_platform = 0; unsigned int best_device = 0; print_devices(0); if(!get_best_device(&best_platform, &best_device)) { printf("No suitable device was found! Try using an OpenCL1.1 compatible device.\n"); exit(1); } printf("Initiating platform-%d device-%d.\n", best_platform, best_device); // Platform err = clGetPlatformIDs(MAX_RESOURCES, platforms, NULL); ocl_error("Getting platform id", err); platform_id = platforms[best_platform]; // Device err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, sizeof(devices), devices, NULL); //NULL, ignore number returned devices. ocl_error("Getting device ids", err); *device_id = devices[best_device]; // Context *context = clCreateContext(0, 1, device_id, NULL, NULL, &err); ocl_error("Creating context", err); // Command-queue *queue = clCreateCommandQueue(*context, *device_id, 0, &err); ocl_error("Creating command queue", err); // Read .cl source into memory int cl_source_len = 0; char* cl_source = file_contents(cl_source_filename, &cl_source_len); // Create thes compute program from the source buffer program = clCreateProgramWithSource(*context, 1, (const char **) &cl_source, NULL, &err); ocl_error("Failed to create compute program", err); // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { char* build_log; size_t log_size; // First call to know the proper size clGetProgramBuildInfo(program, *device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); build_log = malloc(sizeof(char)*(log_size+1)); if(log_size > 0 && build_log != NULL) { // Second call to get the log clGetProgramBuildInfo(program, *device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); build_log[log_size] = '\0'; printf("%s\n", build_log); free(build_log); } exit(err); } // Create the compute kernel in the program we wish to run *kernel = clCreateKernel(program, cl_source_main, &err); ocl_error("Failed to create compute kernel", err); }
PIGLIT_CL_API_TEST_CONFIG_END enum piglit_result piglit_cl_test(const int argc, const char** argv, const struct piglit_cl_api_test_config* config, const struct piglit_cl_api_test_env* env) { enum piglit_result result = PIGLIT_PASS; int i; cl_int errNo; cl_uint num_platforms; cl_platform_id* platforms = NULL; /*** Normal usage ***/ /* get number of platforms */ errNo = clGetPlatformIDs(0, NULL, &num_platforms); if(!piglit_cl_check_error(errNo, CL_SUCCESS)) { piglit_cl_check_error(errNo, CL_SUCCESS); fprintf(stderr, "Failed (error code: %s): Get size of platform list.\n", piglit_cl_get_error_name(errNo)); piglit_merge_result(&result, PIGLIT_FAIL); } else { /* * Get platform list. * Try returning from 1 to num_platforms platforms. */ for(i = 1; i <= num_platforms; i++) { platforms = malloc(i * sizeof(cl_platform_id)); errNo = clGetPlatformIDs(i, platforms, NULL); if(!piglit_cl_check_error(errNo, CL_SUCCESS)) { fprintf(stderr, "Failed (error code: %s): Get platform list.\n", piglit_cl_get_error_name(errNo)); piglit_merge_result(&result, PIGLIT_FAIL); } free(platforms); } } /*** Errors ***/ /* * CL_INVALID_VALUE if num_entries is equal * to zero and platforms is not NULL, or if both num_platforms * and platforms are NULL. */ errNo = clGetPlatformIDs(0, platforms, NULL); if(!piglit_cl_check_error(errNo, CL_INVALID_VALUE)) { fprintf(stderr, "Failed (error code: %s): Trigger CL_INVALID_VALUE if num_entries is equeal to zero and platforms is not NULL.\n", piglit_cl_get_error_name(errNo)); piglit_merge_result(&result, PIGLIT_FAIL); } errNo = clGetPlatformIDs(100, NULL, NULL); if(!piglit_cl_check_error(errNo, CL_INVALID_VALUE)) { fprintf(stderr, "Failed (error code: %s): Trigger CL_INVALID_VALUE if both num_platforms and platforms are NULL.\n", piglit_cl_get_error_name(errNo)); piglit_merge_result(&result, PIGLIT_FAIL); } return result; }
bool Filter::initCL(const Params& params, const char *source, const char *options) { // Ensure no existing context releaseCL(); cl_int err; cl_uint numPlatforms, numDevices; cl_platform_id platform, platforms[params.platformIndex+1]; err = clGetPlatformIDs(params.platformIndex+1, platforms, &numPlatforms); CHECK_ERROR_OCL(err, "getting platforms", return false); if (params.platformIndex >= numPlatforms) { reportStatus("Platform index %d out of range (%d platforms found)", params.platformIndex, numPlatforms); return false; } platform = platforms[params.platformIndex]; cl_device_id devices[params.deviceIndex+1]; err = clGetDeviceIDs(platform, params.type, params.deviceIndex+1, devices, &numDevices); CHECK_ERROR_OCL(err, "getting devices", return false); if (params.deviceIndex >= numDevices) { reportStatus("Device index %d out of range (%d devices found)", params.deviceIndex, numDevices); return false; } m_device = devices[params.deviceIndex]; char name[64]; clGetDeviceInfo(m_device, CL_DEVICE_NAME, 64, name, NULL); reportStatus("Using device: %s", name); m_context = clCreateContext(NULL, 1, &m_device, NULL, NULL, &err); CHECK_ERROR_OCL(err, "creating context", return false); m_queue = clCreateCommandQueue(m_context, m_device, CL_QUEUE_PROFILING_ENABLE, &err); CHECK_ERROR_OCL(err, "creating command queue", return false); m_program = clCreateProgramWithSource(m_context, 1, &source, NULL, &err); CHECK_ERROR_OCL(err, "creating program", return false); err = clBuildProgram(m_program, 1, &m_device, options, NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { size_t sz; clGetProgramBuildInfo( m_program, m_device, CL_PROGRAM_BUILD_LOG, 0, NULL, &sz); char *log = (char*)malloc(++sz); clGetProgramBuildInfo( m_program, m_device, CL_PROGRAM_BUILD_LOG, sz, log, NULL); reportStatus(log); free(log); } CHECK_ERROR_OCL(err, "building program", return false); reportStatus("OpenCL context initialised."); return true; }
int main(int argc, char *argv[]) { cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel_one, kernel_path; cl_mem d_mt_state, d_mt_emit, d_max_prob_old; cl_mem d_max_prob_new, d_path, v_prob, v_path; int wg_size = 256; int n_state = 256*16; int n_emit = 128; int n_obs = 100; size_t init_prob_size = sizeof(float) * n_state; size_t mt_state_size = sizeof(float) * n_state * n_state; size_t mt_emit_size = sizeof(float) * n_emit * n_state; float *init_prob = (float *) malloc(init_prob_size); float *mt_state = (float *) malloc(mt_state_size); float *mt_emit = (float *) malloc(mt_emit_size); int *obs = (int *) malloc(sizeof(int) * n_obs); int *viterbi_gpu = (int *) malloc(sizeof(int) * n_obs); srand(2012); initHMM(init_prob, mt_state, mt_emit, n_state, n_emit); int i; for (i = 0; i < n_obs; i++) { obs[i] = i % 15; } const char *source = load_program_source("Viterbi.cl"); size_t source_len = strlen(source);; cl_uint err = 0; char *flags = "-cl-fast-relaxed-math"; clGetPlatformIDs(1, &platform, NULL); printf("platform %p err %d\n", platform, err); clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err); printf("device %p err %d\n", device, err); context = clCreateContext(0, 1, &device, NULL, NULL, &err); printf("context %p err %d\n", context, err); queue = clCreateCommandQueue(context, device, 0, &err); printf("queue %p err %d\n", queue, err); program = clCreateProgramWithSource(context, 1, &source, &source_len, &err); printf("program %p err %d\n", program, err); err = clBuildProgram(program, 0, NULL, flags, NULL, NULL); printf("err %d\n", err); /* char tmp[102400]; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(tmp), tmp, NULL); printf("error %s\n", tmp); */ kernel_one = clCreateKernel(program, "ViterbiOneStep", &err); printf("kernel %p err %d\n", kernel_one, err); kernel_path = clCreateKernel(program, "ViterbiPath", &err); printf("kernel %p err %d\n", kernel_path, err); d_mt_state = clCreateBuffer(context, CL_MEM_READ_ONLY, mt_state_size, NULL, &err); printf("buffer %p\n", d_mt_state); d_mt_emit = clCreateBuffer(context, CL_MEM_READ_ONLY, mt_emit_size, NULL, &err); printf("buffer %p\n", d_mt_emit); d_max_prob_new = clCreateBuffer(context, CL_MEM_READ_WRITE, init_prob_size, NULL, &err); printf("buffer %p\n", d_max_prob_new); d_max_prob_old = clCreateBuffer(context, CL_MEM_READ_WRITE, init_prob_size, NULL, &err); printf("buffer %p\n", d_max_prob_old); d_path = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*(n_obs-1)*n_state, NULL, &err); printf("buffer %p\n", d_path); v_prob = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float), NULL, &err); printf("buffer %p\n", v_prob); v_path = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*n_obs, NULL, &err); printf("buffer %p\n", v_prob); err = clEnqueueWriteBuffer(queue, d_mt_state, CL_TRUE, 0, mt_state_size, mt_state, 0, NULL, NULL); printf("err %d\n", err); err = clEnqueueWriteBuffer(queue, d_mt_emit, CL_TRUE, 0, mt_emit_size, mt_emit, 0, NULL, NULL); printf("err %d\n", err); err = clEnqueueWriteBuffer(queue, d_max_prob_old, CL_TRUE, 0, init_prob_size, init_prob, 0, NULL, NULL); printf("err %d\n", err); // max_wg_size is 1024 for Intel Core 2 CPU size_t max_wg_size; err = clGetKernelWorkGroupInfo(kernel_one, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL); printf("max_wg_size %d\n", max_wg_size); size_t local_work_size[2], global_work_size[2]; local_work_size[0] = wg_size; local_work_size[1] = 1; global_work_size[0] = local_work_size[0] * 256; global_work_size[1] = n_state/256; for (i = 1; i < n_obs; i++) { err = clSetKernelArg(kernel_one, 0, sizeof(cl_mem), (void*)&d_max_prob_new); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 1, sizeof(cl_mem), (void*)&d_path); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 2, sizeof(cl_mem), (void*)&d_max_prob_old); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 3, sizeof(cl_mem), (void*)&d_mt_state); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 4, sizeof(cl_mem), (void*)&d_mt_emit); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 5, sizeof(float)*local_work_size[0], NULL); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 6, sizeof(int)*local_work_size[0], NULL); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 7, sizeof(int), (void*)&n_state); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 8, sizeof(int), (void*)&(obs[i])); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 9, sizeof(int), (void*)&i); printf("err %d\n", err); err = clEnqueueNDRangeKernel(queue, kernel_one, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); printf("err %d\n", err); err = clEnqueueCopyBuffer(queue, d_max_prob_new, d_max_prob_old, 0, 0, sizeof(float)*n_state, 0, NULL, NULL); printf("err %d\n", err); } local_work_size[0] = 1; global_work_size[0] = 1; err = clSetKernelArg(kernel_path, 0, sizeof(cl_mem), (void*)&v_prob); printf("err %d\n", err); err = clSetKernelArg(kernel_path, 1, sizeof(cl_mem), (void*)&v_path); printf("err %d\n", err); err = clSetKernelArg(kernel_path, 2, sizeof(cl_mem), (void*)&d_max_prob_new); printf("err %d\n", err); err = clSetKernelArg(kernel_path, 3, sizeof(cl_mem), (void*)&d_path); printf("err %d\n", err); err = clSetKernelArg(kernel_path, 4, sizeof(int), (void*)&n_state); printf("err %d\n", err); err = clSetKernelArg(kernel_path, 5, sizeof(int), (void*)&n_obs); printf("err %d\n", err); err = clEnqueueNDRangeKernel(queue, kernel_path, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); printf("err %d\n", err); clFinish(queue); printf("finish done\n"); err = clEnqueueReadBuffer(queue, v_path, CL_TRUE, 0, sizeof(int)*n_obs, viterbi_gpu, 0, NULL, NULL); printf("err %d\n", err); for (i = 0; i < n_obs; i++) { printf("%d %d\n", i, viterbi_gpu[i]); } clReleaseMemObject(d_mt_state); clReleaseMemObject(d_mt_emit); clReleaseMemObject(d_max_prob_old); clReleaseMemObject(d_max_prob_new); clReleaseMemObject(d_path); clReleaseMemObject(v_prob); clReleaseMemObject(v_path); clReleaseProgram(program); clReleaseKernel(kernel_one); clReleaseKernel(kernel_path); clReleaseCommandQueue(queue); }
static int SetupComputeDevices(int gpu) { int err; size_t returned_size; ComputeDeviceType = gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU; #if (USE_GL_ATTACHMENTS) printf(SEPARATOR); printf("Using active OpenGL context...\n"); // Bind to platform cl_platform_id platform_id; cl_uint numPlatforms; cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); if (status != CL_SUCCESS) { printf("clGetPlatformIDs Failed\n"); return EXIT_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = (cl_platform_id*)calloc(numPlatforms, sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); char platformName[100]; for (unsigned i = 0; i < numPlatforms; ++i) { status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(platformName), platformName, NULL); platform_id = platforms[i]; if (!strcmp(platformName, "Advanced Micro Devices, Inc.")) { break; } } printf("Platform found : %s\n", platformName); free(platforms); } if(NULL == platform_id) { printf("NULL platform found so Exiting Application.\n"); return EXIT_FAILURE; } // Get ID for the device err = clGetDeviceIDs(platform_id, ComputeDeviceType, 1, &ComputeDeviceId, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to locate compute device!\n"); return EXIT_FAILURE; } // Create a context cl_context_properties properties[] = { CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(), CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(), CL_CONTEXT_PLATFORM, (cl_context_properties)(platform_id), 0 }; // Create a context from a CGL share group // ComputeContext = clCreateContext(properties, 1, &ComputeDeviceId, NULL, 0, 0); if (!ComputeContext) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } #else // Bind to platform cl_platform_id platform_id; cl_uint numPlatforms; cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); if (status != CL_SUCCESS) { printf("clGetPlatformIDs Failed\n"); return EXIT_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = (cl_platform_id*)calloc(numPlatforms, sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); char platformName[100]; for (unsigned i = 0; i < numPlatforms; ++i) { status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(platformName), platformName, NULL); platform_id = platforms[i]; if (!strcmp(platformName, "Advanced Micro Devices, Inc.")) { break; } } printf("Platform found : %s\n", platformName); free(platforms); } if(NULL == platform_id) { printf("NULL platform found so Exiting Application.\n"); return EXIT_FAILURE; } // Get ID for the device err = clGetDeviceIDs(platform_id, ComputeDeviceType, 1, &ComputeDeviceId, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to locate compute device!\n"); return EXIT_FAILURE; } // Create a context containing the compute device(s) // ComputeContext = clCreateContext(0, 1, &ComputeDeviceId, NULL, NULL, &err); if (!ComputeContext) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } #endif unsigned int device_count; cl_device_id device_ids[16]; err = clGetContextInfo(ComputeContext, CL_CONTEXT_DEVICES, sizeof(device_ids), device_ids, &returned_size); if(err) { printf("Error: Failed to retrieve compute devices for context!\n"); return EXIT_FAILURE; } device_count = returned_size / sizeof(cl_device_id); unsigned int i = 0; int device_found = 0; cl_device_type device_type; for(i = 0; i < device_count; i++) { clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL); if(device_type == ComputeDeviceType) { ComputeDeviceId = device_ids[i]; device_found = 1; break; } } if(!device_found) { printf("Error: Failed to locate compute device!\n"); return EXIT_FAILURE; } // Create a command queue // ComputeCommands = clCreateCommandQueue(ComputeContext, ComputeDeviceId, 0, &err); if (!ComputeCommands) { printf("Error: Failed to create a command queue!\n"); return EXIT_FAILURE; } // Report the device vendor and device name // cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; err = clGetDeviceInfo(ComputeDeviceId, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size); err|= clGetDeviceInfo(ComputeDeviceId, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve device info!\n"); return EXIT_FAILURE; } printf(SEPARATOR); printf("Connecting to %s %s...\n", vendor_name, device_name); return CL_SUCCESS; }
int main(int argc, char*argv[]) { if (argc != 4) { printf("Usage: %s #m #n #k\n", argv[0]); exit(1); } int *m,*n,*k,i; m = (int *) malloc(sizeof(int)); n = (int *) malloc(sizeof(int)); k = (int *) malloc(sizeof(int)); //Initilizing the matrix dimensions m[0] = atoi(argv[1]); n[0] = atoi(argv[2]); k[0] = atoi(argv[3]); double time = 0; clock_t begin = clock(); cl_device_id deviceId = NULL; cl_context context = NULL; cl_command_queue commandQueue = NULL; double *alpha, *beta; //allocating memory for the vectors alpha = (double *) malloc(sizeof(double)); beta = (double *) malloc(sizeof(double)); double *A, *B, *C; A = (double *) malloc(m[0]*k[0]*sizeof(double)); B = (double *) malloc(k[0]*n[0]*sizeof(double)); C = (double *) malloc(m[0]*n[0]*sizeof(double)); //initializing values of alpha and beta alpha[0] = 1.0; beta[0] = 0.0; clock_t end = clock(); time += (double)(end - begin) * 1000 / CLOCKS_PER_SEC; //printf (" Intializing matrix data \n\n"); begin = clock(); for (i = 0; i < (m[0]*k[0]); i++) { A[i] = (double)(i+1); } for (i = 0; i < (k[0]*n[0]); i++) { B[i] = (double)(-i-1); } for (i = 0; i < (m[0]*n[0]); i++) { C[i] = 0.0; } //Memory objects for kernel parameters cl_mem AMemobj = NULL; cl_mem BMemobj = NULL; cl_mem CMemobj = NULL; cl_mem mMemobj = NULL; cl_mem nMemobj = NULL; cl_mem kMemobj = NULL; cl_mem alphaMemobj = NULL; cl_mem betaMemobj = NULL; //Some opencl objects cl_program program = NULL; cl_kernel kernel = NULL; cl_platform_id platformId = NULL; cl_uint numDevices; cl_uint numPlatforms; cl_int ret; size_t contextDescriptorSize; //reading kernel from file FILE *file; char fileName[] = "./dgemm.cl"; char *kernelSource; size_t sourceSize; //Load the source code from file file = fopen(fileName, "r"); if(!file) { printf("Failed to load the kernel file. \n"); exit(1); } kernelSource = (char *) malloc(SOURCE_SIZE_MAX); sourceSize = fread(kernelSource, 1, SOURCE_SIZE_MAX, file); fclose(file); //Get platform information ret = clGetPlatformIDs(1, &platformId, &numPlatforms); //get list of devices ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_DEFAULT, 1, &deviceId, &numDevices); //create opencl device context context = clCreateContext(NULL, 1, &deviceId, NULL, NULL, &ret); //get device context clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, 0, &contextDescriptorSize); //command queue for the first device commandQueue = clCreateCommandQueue(context, deviceId, 0, &ret); //Create memory object AMemobj = clCreateBuffer(context, CL_MEM_READ_ONLY, m[0]*k[0] * sizeof(double), A, &ret); BMemobj = clCreateBuffer(context, CL_MEM_READ_ONLY, k[0]*n[0] * sizeof(double), B, &ret); CMemobj = clCreateBuffer(context, CL_MEM_READ_WRITE, m[0]*n[0] * sizeof(double), C, &ret); mMemobj = clCreateBuffer(context, CL_MEM_READ_ONLY, 1 * sizeof(int), m, &ret); nMemobj = clCreateBuffer(context, CL_MEM_READ_ONLY, 1 * sizeof(int), n, &ret); kMemobj = clCreateBuffer(context, CL_MEM_READ_ONLY, 1 * sizeof(int), k, &ret); alphaMemobj = clCreateBuffer(context, CL_MEM_READ_ONLY, 1 * sizeof(double), alpha, &ret); betaMemobj = clCreateBuffer(context, CL_MEM_READ_ONLY, 1 * sizeof(double), beta, &ret); //Create kernel program from source program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource,(const size_t *)&sourceSize, &ret); //Write data to the buffer ret = clEnqueueWriteBuffer(commandQueue, AMemobj, CL_TRUE, 0, sizeof(double) * m[0]*k[0], A, 0, NULL, NULL); ret = clEnqueueWriteBuffer(commandQueue, BMemobj, CL_TRUE, 0, sizeof(double) * k[0]*n[0], B, 0, NULL, NULL); ret = clEnqueueWriteBuffer(commandQueue, CMemobj, CL_TRUE, 0, sizeof(double) * m[0]*n[0], C, 0, NULL, NULL); ret = clEnqueueWriteBuffer(commandQueue, mMemobj, CL_TRUE, 0, 1 * sizeof(int), m, 0, NULL, NULL); ret = clEnqueueWriteBuffer(commandQueue, nMemobj, CL_TRUE, 0, 1 * sizeof(int), n, 0, NULL, NULL); ret = clEnqueueWriteBuffer(commandQueue, kMemobj, CL_TRUE, 0, 1 * sizeof(int), k, 0, NULL, NULL); ret = clEnqueueWriteBuffer(commandQueue, alphaMemobj, CL_TRUE, 0, 1 * sizeof(double), alpha, 0, NULL, NULL); ret = clEnqueueWriteBuffer(commandQueue, betaMemobj, CL_TRUE, 0, 1 * sizeof(double), beta, 0, NULL, NULL); //Build the kernel program ret = clBuildProgram(program, 1, &deviceId, "-I ./", NULL, NULL); //Create a opencl kernel kernel = clCreateKernel(program, "dgemm", &ret); //Pass arguments to kernel ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&AMemobj); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&BMemobj); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&CMemobj); ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&mMemobj); ret = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&nMemobj); ret = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&kMemobj); ret = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&alphaMemobj); ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&betaMemobj); // Execute OpenCL kernel in data parallel const int TS = 32; const size_t local[2] = { TS, TS }; const size_t global[2] = { (int)(pow(2,ceil(log(m[0])/log(2)))), (int)(pow(2,ceil(log(n[0])/log(2)))) }; //Execute opencl kernel ret = clEnqueueNDRangeKernel( commandQueue, kernel, 2, NULL, global, local, 0, 0, 0 ); //copy the result back ret = clEnqueueReadBuffer(commandQueue, CMemobj, CL_TRUE, 0, m[0]*n[0]*sizeof(double), C, 0, NULL, NULL); //Before program termination ret = clFlush(commandQueue); ret = clFinish(commandQueue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(AMemobj); ret = clReleaseMemObject(BMemobj); ret = clReleaseMemObject(CMemobj); ret = clReleaseMemObject(mMemobj); ret = clReleaseMemObject(nMemobj); ret = clReleaseMemObject(kMemobj); ret = clReleaseMemObject(alphaMemobj); ret = clReleaseMemObject(betaMemobj); ret = clReleaseCommandQueue(commandQueue); ret = clReleaseContext(context); free(kernelSource); free(A); free(B); free(C); free(m); free(n); free(k); free(alpha); free(beta); //Printing timing result end = clock(); time += (double)(end - begin) *1000 / CLOCKS_PER_SEC; printf("%lf\n", time); return 0; }
void OCL_base::init_kernel(const char* kernel_source, const char* kernel_name, std::string define_statements, bool compile_source) { if(!compile_source){ Rprintf("Binary sources not supported yet\n"); return; } char* kernel_buffer = 0; size_t k_buffer_size=0; // if the kernel source has been predefined in "../oCL_Kernels/oCL_Kernels.h" // then use that. if(!strcmp(kernel_source, "move_deltoids")){ // strcmp returns 0 for equal kernel_buffer = make_source(define_statements, move_deltoids, k_buffer_size); } if(!strcmp(kernel_source, "move_deltoids_2")){ kernel_buffer = make_source(define_statements, move_deltoids_2, k_buffer_size); } if(!strcmp(kernel_source, "move_deltoids_dummy")){ kernel_buffer = make_source(define_statements, move_deltoids_dummy, k_buffer_size); } // if kernel buffer is not defined yet, try to read it from a file if(!kernel_buffer){ std::ifstream in(kernel_source, std::ios::binary); if(!in){ Rprintf("Unable to open kernel source file\n"); return; } in.seekg(0, std::ios::end); ssize_t end_pos = in.tellg(); Rprintf("in.tellg() reports : %d\n", end_pos); in.seekg(0, std::ios::beg); if(!end_pos){ Rprintf("Unable to read from kernel source file\n"); return; } // prepend #define statements to the kernal source. // add one extra \n to the k_buffer_size = 1 + define_statements.size() + (size_t)end_pos + 1; kernel_buffer = new char[ k_buffer_size ]; memset((void*)kernel_buffer, 0, sizeof(char) * k_buffer_size); // kernel_buffer[k_buffer_size] = 0; // this may not be needed. size_t copied_bytes = define_statements.copy(kernel_buffer, define_statements.size()); if(copied_bytes != define_statements.size()){ Rprintf("Unable to copy the full define statements\n"); delete []kernel_buffer; return; } kernel_buffer[ define_statements.size() ] = '\n'; // add a new line for safety in.read((kernel_buffer + 1 + define_statements.size()), end_pos); if(in.gcount() != end_pos){ Rprintf("Unable to read to end of kernel source file: %d != %d\n", end_pos, in.gcount()); delete []kernel_buffer; return; } } // at this point kernel_buffer should be defined. // and we can use it to compile the kernel. cl_int ret = 0; // return value. Use the same for all. ret = clGetPlatformIDs(1, &platform_id, &num_platforms); if(ret) report_error_pf("clGetPlatformIDs", ret); ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices ); if(ret) report_error_pf("clGetDeviceIDs", ret); if(ret != CL_SUCCESS){ Rprintf("clGetDevice returned with error: %d\n", (int)ret); return; } context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); report_error_pf("clCreateContext", ret); command_que = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); report_error_pf("clCreateCommandQueue", ret); program = clCreateProgramWithSource(context, 1, (const char**)&kernel_buffer, (const size_t*)&k_buffer_size, &ret); report_error_pf("clCreateProgramWithSource", ret); clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); report_error_pf("clBuildProgram", ret); char* build_log = NULL; size_t log_size = 1000; ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, build_log, &log_size); report_error_pf("clGetProgramBuildInfo", ret); build_log = new char[log_size+1]; ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); report_error_pf("clGetProgramBuildInfo", ret); // the below doesn't make much sense. But log_size is never 0, so, need to do something Rprintf("After building the kernel the log size is : %d\n", log_size); if(log_size > 2){ Rprintf("clBuildProgram Error encountered:\n%s\n", build_log); Rprintf(".....\n%s\n....\n", kernel_buffer); } delete []build_log; kernel = clCreateKernel(program, kernel_name, &ret); report_error_pf("clCreateKernel", ret); delete []kernel_buffer; }
LIBSTDCL_API CONTEXT* clcontext_create( const char* platform_name, int devtyp, size_t ndevmax, cl_context_properties* ctxprop_ext, int lock_key ) { int n; int err = 0; int i; size_t devlist_sz; CONTEXT* cp = 0; cl_platform_id* platforms = 0; cl_uint nplatforms; char info[1024]; cl_platform_id platformid; int nctxprop = 0; cl_context_properties* ctxprop; size_t sz; cl_uint ndev = 0; cl_command_queue_properties prop = 0; DEBUG(__FILE__,__LINE__,"clcontext_create() called"); // if (ndevmax) // WARN(__FILE__,__LINE__,"__clcontext_create(): ndevmax argument ignored"); /*** *** allocate CONTEXT struct ***/ DEBUG(__FILE__,__LINE__, "clcontext_create: sizeof CONTEXT %d",sizeof(CONTEXT)); // cp = (CONTEXT*)malloc(sizeof(CONTEXT)); assert(sizeof(CONTEXT)<getpagesize()); #ifdef _WIN64 cp = (CONTEXT*)_aligned_malloc(sizeof(CONTEXT),getpagesize()); if (!cp) { WARN(__FILE__,__LINE__,"memalign failed"); } #else if (posix_memalign((void**)&cp,getpagesize(),sizeof(CONTEXT))) { WARN(__FILE__,__LINE__,"posix_memalign failed"); } #endif DEBUG(__FILE__,__LINE__,"clcontext_create: context_ptr=%p",cp); if ((intptr_t)cp & (getpagesize()-1)) { ERROR(__FILE__,__LINE__, "clcontext_create: fatal error: unaligned context_ptr"); exit(-1); } if (!cp) { errno=ENOMEM; return(0); } /*** *** get platform id ***/ clGetPlatformIDs(0,0,&nplatforms); // printf("XXX %d\n",nplatforms); if (nplatforms) { platforms = (cl_platform_id*)malloc(nplatforms*sizeof(cl_platform_id)); clGetPlatformIDs(nplatforms,platforms,0); for(i=0;i<nplatforms;i++) { char info[1024]; DEBUG(__FILE__,__LINE__,"_libstdcl_init: available platform:"); clGetPlatformInfo(platforms[i],CL_PLATFORM_PROFILE,1024,info,0); DEBUG(__FILE__,__LINE__, "_libstdcl_init: [%p]CL_PLATFORM_PROFILE=%s",platforms[i],info); clGetPlatformInfo(platforms[i],CL_PLATFORM_VERSION,1024,info,0); DEBUG(__FILE__,__LINE__, "_libstdcl_init: [%p]CL_PLATFORM_VERSION=%s",platforms[i],info); clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME,1024,info,0); DEBUG(__FILE__,__LINE__, "_libstdcl_init: [%p]CL_PLATFORM_NAME=%s",platforms[i],info); clGetPlatformInfo(platforms[i],CL_PLATFORM_VENDOR,1024,info,0); DEBUG(__FILE__,__LINE__, "_libstdcl_init: [%p]CL_PLATFORM_VENDOR=%s",platforms[i],info); clGetPlatformInfo(platforms[i],CL_PLATFORM_EXTENSIONS,1024,info,0); DEBUG(__FILE__,__LINE__, "_libstdcl_init: [%p]CL_PLATFORM_EXTENSIONS=%s",platforms[i],info); } } else { WARN(__FILE__,__LINE__, "_libstdcl_init: no platforms found, continue and hope for the best"); } platformid = __get_platformid(nplatforms, platforms, platform_name); DEBUG(__FILE__,__LINE__,"clcontext_create: platformid=%p",platformid); /*** *** create context ***/ while (ctxprop_ext != 0 && ctxprop_ext[nctxprop] != 0) ++nctxprop; // cl_context_properties ctxprop[3] = { // (cl_context_properties)CL_CONTEXT_PLATFORM, // (cl_context_properties)platformid, // (cl_context_properties)0 // }; nctxprop += 3; ctxprop = (cl_context_properties*)malloc(nctxprop*sizeof(cl_context_properties)); ctxprop[0] = (cl_context_properties)CL_CONTEXT_PLATFORM; ctxprop[1] = (cl_context_properties)platformid; for(i=0;i<nctxprop-3;i++) ctxprop[2+i] = ctxprop_ext[i]; ctxprop[nctxprop-1] = (cl_context_properties)0; clGetPlatformInfo(platformid,CL_PLATFORM_PROFILE,0,0,&sz); cp->platform_profile = (char*)malloc(sz); clGetPlatformInfo(platformid,CL_PLATFORM_PROFILE,sz,cp->platform_profile,0); clGetPlatformInfo(platformid,CL_PLATFORM_VERSION,0,0,&sz); cp->platform_version = (char*)malloc(sz); clGetPlatformInfo(platformid,CL_PLATFORM_VERSION,sz,cp->platform_version,0); clGetPlatformInfo(platformid,CL_PLATFORM_NAME,0,0,&sz); cp->platform_name = (char*)malloc(sz); clGetPlatformInfo(platformid,CL_PLATFORM_NAME,sz,cp->platform_name,0); clGetPlatformInfo(platformid,CL_PLATFORM_VENDOR,0,0,&sz); cp->platform_vendor = (char*)malloc(sz); clGetPlatformInfo(platformid,CL_PLATFORM_VENDOR,sz,cp->platform_vendor,0); clGetPlatformInfo(platformid,CL_PLATFORM_EXTENSIONS,0,0,&sz); cp->platform_extensions = (char*)malloc(sz); clGetPlatformInfo(platformid,CL_PLATFORM_EXTENSIONS,sz, cp->platform_extensions,0); #ifdef _WIN64 cp->ctx = clCreateContextFromType(ctxprop,devtyp,0,0,&err); #else if (lock_key > 0) { if (ndevmax == 0) ndevmax = 1; cl_uint platform_ndev; err = clGetDeviceIDs(platformid,devtyp,0,0,&platform_ndev); // cl_uint platform_vndev = 2*platform_ndev; cl_uint platform_vndev = platform_ndev; //DEBUG(__FILE__,__LINE__,"%d %d",platform_ndev,platform_vndev); cl_device_id* platform_dev = (cl_device_id*)malloc(platform_ndev*sizeof(cl_device_id)); err = clGetDeviceIDs(platformid,devtyp,platform_ndev,platform_dev,0); DEBUG(__FILE__,__LINE__,"clcontext_create: lock_key=%d",lock_key); pid_t pid = getpid(); size_t sz_page = getpagesize(); // system("ls /dev/shm"); char shmobj[64]; snprintf(shmobj,64,"/stdcl_ctx_lock%d.%d",devtyp,lock_key); DEBUG(__FILE__,__LINE__, "clcontext_create: attempt master shm_open %s from %d",shmobj,pid); int fd = shm_open(shmobj,O_RDWR|O_CREAT|O_EXCL,0); void* p0; struct timeval t0,t1; int timeout = 0; int noff = 0; if (fd < 0) { DEBUG(__FILE__,__LINE__, "clcontext_create: master shm_open failed from %d (%d)",pid,fd); DEBUG(__FILE__,__LINE__, "clcontext_create: attempt slave shm_open from %d",pid); timeout = 0; gettimeofday(&t0,0); t0.tv_sec += 10; do { fd = shm_open(shmobj,O_RDWR,0); gettimeofday(&t1,0); if (t1.tv_sec > t0.tv_sec && t1.tv_usec > t0.tv_usec) timeout = 1; } while (fd < 0 && !timeout); if (timeout) { ERROR(__FILE__,__LINE__,"clcontext_create: shm_open timeout"); } ftruncate(fd,sz_page); p0 = mmap(0,sz_page,PROT_READ|PROT_WRITE,MAP_SHARED,fd,0); if (!p0) return(0); __ctx_lock = (struct __ctx_lock_struct*)p0; pthread_mutex_lock(&__ctx_lock->mtx); // if (__ctx_lock->refc < platform_ndev) { if (__ctx_lock->refc < platform_vndev) { noff = __ctx_lock->refc; // ndev = min(ndevmax,platform_ndev-noff); ndev = min(ndevmax,platform_vndev-noff); __ctx_lock->refc += ndev; } pthread_mutex_unlock(&__ctx_lock->mtx); close(fd); } else { DEBUG(__FILE__,__LINE__, "clcontext_create: master shm_open succeeded from %d",pid); ftruncate(fd,sz_page); p0 = mmap(0,sz_page,PROT_READ|PROT_WRITE,MAP_SHARED,fd,0); if (!p0) return(0); __ctx_lock = (struct __ctx_lock_struct*)p0; __ctx_lock->magic = 20110415; __ctx_lock->key = lock_key; pthread_mutex_init(&__ctx_lock->mtx,0); // ndev = min(ndevmax,platform_ndev); ndev = min(ndevmax,platform_vndev); DEBUG(__FILE__,__LINE__,"ndev=%d %d %d",ndev,ndevmax,platform_vndev); __ctx_lock->refc = ndev; fchmod(fd,S_IRUSR|S_IWUSR); close(fd); } DEBUG(__FILE__,__LINE__,"ndev=%d",ndev); // if (noff < platform_ndev) { if (noff < platform_vndev) { // cp->ctx = clCreateContext(ctxprop,ndev,platform_dev + noff,0,0,&err); cp->ctx = clCreateContext(ctxprop,ndev,platform_dev + noff%platform_ndev,0,0,&err); DEBUG(__FILE__,__LINE__, "clcontext_create: platform_ndev=%d ndev=%d noffset=%d", platform_ndev,ndev,noff); if (platform_dev) free(platform_dev); } else { cp->ctx = 0; } } else { cp->ctx = clCreateContextFromType(ctxprop,devtyp,0,0,&err); } #endif if (cp->ctx) { cp->devtyp = devtyp; err = clGetContextInfo(cp->ctx,CL_CONTEXT_DEVICES,0,0,&devlist_sz); cp->ndev = devlist_sz/sizeof(cl_device_id); cp->dev = (cl_device_id*)malloc(10*devlist_sz); err=clGetContextInfo(cp->ctx,CL_CONTEXT_DEVICES,devlist_sz,cp->dev,0); // cp->devtyp = devtyp; // err = clGetDeviceIDs(platformid,devtyp,0,0,&(cp->ndev)); // DEBUG(__FILE__,__LINE__,"xxx %d",err); // DEBUG(__FILE__,__LINE__,"number of devices %d",cp->ndev); // cp->dev = (cl_device_id*)malloc(cp->ndev * sizeof(cl_device_id) ); // err = clGetDeviceIDs(platformid,devtyp,cp->ndev,cp->dev,&(cp->ndev)); // DEBUG(__FILE__,__LINE__,"xxx %d",err); // DEBUG(__FILE__,__LINE__," %p device[0]",cp->dev[0]); } else { WARN(__FILE__,__LINE__,"clcontext_create: failed"); #ifndef _WIN64 if (lock_key > 0 && ndev > 0) { pthread_mutex_lock(&__ctx_lock->mtx); __ctx_lock->refc -= ndev; pthread_mutex_unlock(&__ctx_lock->mtx); } free(cp); #else _aligned_free(cp); #endif return((CONTEXT*)0); } DEBUG(__FILE__,__LINE__,"number of devices %d",cp->ndev); /* XXX XXX TESTING ONLY!!!! */ //_aligned_free(cp); //DEBUG(__FILE__,__LINE__,"MADE IT"); return (CONTEXT*)0; /*** *** create command queues ***/ cp->cmdq = (cl_command_queue*)malloc(sizeof(cl_command_queue)*cp->ndev); DEBUG(__FILE__,__LINE__,"will try to create cmdq"); // XXX something is broken in clCreateCommandQueue, using lazy creation // XXX as a workaround -DAR // { //cl_command_queue_properties prop = 00; //prop |= CL_QUEUE_PROFILING_ENABLE; /* XXX this should be choice -DAR */ for(i=0;i<cp->ndev;i++) { //DEBUG(__FILE__,__LINE__,"%d calling clCreateCommandQueue(%p,%p,%x,%p)",i,cp->ctx,cp->dev[i],prop,&err); #ifdef _WIN64 cp->cmdq[i] = 0; /* have to defer, dllmain limitations */ #else cp->cmdq[i] = clCreateCommandQueue(cp->ctx,cp->dev[i],prop,&err); //cp->cmdq[i] = clCreateCommandQueue(cp->ctx,cp->dev[i],0,&err); //cl_command_queue cmdq = clCreateCommandQueue(cp->ctx,cp->dev[0],0,&err); DEBUG(__FILE__,__LINE__,"clcontext_create: error from create cmdq %d (%p)\n", err,cp->cmdq[i]); #endif //DEBUG(__FILE__,__LINE__,"MADE IT"); return (CONTEXT*)0; } // } // printf("WARNING CMDQs NOT CREATED\n"); // for(i=0;i<cp->ndev;i++) cp->cmdq[i] = (cl_command_queue)0; /*** *** init context resources ***/ LIST_INIT(&cp->prgs_listhead); LIST_INIT(&cp->txt_listhead); LIST_INIT(&cp->memd_listhead); // struct _prgs_struct* prgs // = (struct _prgs_struct*)malloc(sizeof(struct _prgs_struct)); // prgs->len=-1; // LIST_INSERT_HEAD(&cp->prgs_listhead, prgs, prgs_list); // // prgs = (struct _prgs_struct*)malloc(sizeof(struct _prgs_struct)); // prgs->len=-2; // LIST_INSERT_HEAD(&cp->prgs_listhead, prgs, prgs_list); /* printf("%p searching _proc_cl for prgs...\n",_proc_cl.clstrtab); printf("%s\n",&_proc_cl.clstrtab[1]); struct clprgs_entry* sp; for(n=0,sp=_proc_cl.clprgs;n<_proc_cl.clprgs_n;n++,sp++) { printf("found %s (%d bytes)\n",&_proc_cl.clstrtab[sp->e_name],sp->e_size); struct _prgs_struct* prgs = (struct _prgs_struct*) clload(cp,_proc_cl.cltexts+sp->e_offset,sp->e_size,0); } */ /*** *** initialize event lists ***/ // cp->nkev = cp->kev_first = cp->kev_free = 0; cp->kev = (struct _event_list_struct*) malloc(cp->ndev*sizeof(struct _event_list_struct)); for(i=0;i<cp->ndev;i++) { cp->kev[i].nev = cp->kev[i].ev_first = cp->kev[i].ev_free = 0; } // cp->nmev = cp->mev_first = cp->mev_free = 0; cp->mev = (struct _event_list_struct*) malloc(cp->ndev*sizeof(struct _event_list_struct)); for(i=0;i<cp->ndev;i++) { cp->mev[i].nev = cp->mev[i].ev_first = cp->mev[i].ev_free = 0; } //#ifdef ENABLE_CLEXPORT // cp->ndev_v = 0; // cp->extd = 0; // cp->imtd = 0; //#endif if (platforms) free(platforms); return(cp); }
cl_int OCL_Environment::init(OCL_Environment_Desc desc) { OCL_LoadLibrary(); // Get number of available platforms OCL_RETURN_ON_ERR( clGetPlatformIDs( 0, NULL, &uiNumPlatforms ) ); if( !(uiNumPlatforms > 0) ) { printf("No available platform!"); abort(); } // Create platforms cl_platform_id* pPlatformIDs = (cl_platform_id*)calloc( sizeof(cl_platform_id), uiNumPlatforms ); OCL_RETURN_ON_ERR( clGetPlatformIDs( uiNumPlatforms, pPlatformIDs, NULL ) ); // Alloc OCL_Platforms if( desc.sPlatformName ) mpPlatforms = new OCL_Platform[1]; else mpPlatforms = new OCL_Platform[uiNumPlatforms]; bool founddev = false; for( cl_uint i=0; i<uiNumPlatforms; i++ ) { char* sPlatname; size_t nameSize; OCL_RETURN_ON_ERR( clGetPlatformInfo( pPlatformIDs[i], CL_PLATFORM_NAME, 0, NULL, &nameSize ) ); sPlatname = new char[nameSize]; OCL_RETURN_ON_ERR( clGetPlatformInfo( pPlatformIDs[i], CL_PLATFORM_NAME, nameSize, sPlatname, NULL ) ); if( desc.sPlatformName ) { if( strcmp(sPlatname,desc.sPlatformName ) == 0 ) { uiNumPlatforms = 1; OCL_RETURN_ON_ERR( mpPlatforms[0].init(pPlatformIDs[i],desc) ); founddev = true; break; } } else { OCL_RETURN_ON_ERR( mpPlatforms[i].init(pPlatformIDs[i],desc) ); if(mpPlatforms[i].uiNumDevices > 0) founddev = true;//at least 1 compatible device found } } delete pPlatformIDs; //if no device found return the error if(!founddev) OCL_RETURN_ON_ERR(CL_DEVICE_NOT_FOUND); return CL_SUCCESS; }
static cl::Context PlatformContext(cl_device_type device_type, char* platform_vendor_name, bool enable_gl_interop = false) { cl_uint numPlatforms; cl_platform_id platform = NULL; clGetPlatformIDs(0, NULL, &numPlatforms); if (numPlatforms > 0) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; clGetPlatformIDs(numPlatforms, platforms, NULL); for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); platform = platforms[i]; std::cout << "platform: " << pbuf << std::endl; if (!strcmp(pbuf, platform_vendor_name)) { break; } } delete[] platforms; } if (enable_gl_interop) { // Define OS-specific context properties and create the OpenCL context #if defined (__APPLE__) CGLContextObj kCGLContext = CGLGetCurrentContext(); CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext); cl_context_properties cps[] = { CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup, 0 }; #else #if defined(linux) cl_context_properties cps[] = { CL_GL_CONTEXT_KHR, cl_context_properties(glXGetCurrentContext()), CL_GLX_DISPLAY_KHR, cl_context_properties(glXGetCurrentDisplay()), CL_CONTEXT_PLATFORM, cl_context_properties(platform), 0 }; #else // Win32 cl_context_properties cps[] = { CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; #endif #endif cl::Platform _platform(platform); cl::vector<cl::Device> *_devices = new cl::vector<cl::Device>(); _platform.getDevices(CL_DEVICE_TYPE_GPU, _devices); if(_devices->size() > 1) _devices->pop_back(); if (platform == NULL) return cl::Context(device_type, NULL); else return cl::Context(*_devices,cps); return (NULL == platform) ? cl::Context(device_type, NULL) : cl::Context(*_devices, cps); }else //no opengl interoperability { cl_context_properties cps[] = { CL_CONTEXT_PLATFORM, cl_context_properties(platform), 0 }; return (NULL == platform) ? cl::Context(device_type, NULL) : cl::Context(device_type, cps); } }
int main() { // This code executes on the OpenCL host // Host data int *A = NULL; // Input array int *B = NULL; // Input array int *C = NULL; // Output array // Elements in each array const int elements = 2048; // Compute the size of the data size_t datasize = sizeof(int)*elements; // Allocate space for input/output data A = (int*)malloc(datasize); B = (int*)malloc(datasize); C = (int*)malloc(datasize); // Initialize the input data int i; for(i = 0; i < elements; i++) { A[i] = i; B[i] = i; } // Use this to check the output of each API call cl_int status; // Retrieve the number of platforms cl_uint numPlatforms = 0; status = clGetPlatformIDs(0, NULL, &numPlatforms); // Allocate enough space for each platform cl_platform_id *platforms = NULL; platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id)); // Fill in the platforms status = clGetPlatformIDs(numPlatforms, platforms, NULL); cl_int platform_index = -1; char cBuffer[1024]; for (i=0; i<numPlatforms; i++) { clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL); if (strstr(cBuffer, "Intel") != NULL) { platform_index = i; break; } } if (platform_index < 0) { printf("Cannot find platforms support OpenCL.\n"); return -1; } else { printf("Selected platform '%s'. %d\n", cBuffer, platform_index); } // Retrieve the number of devices cl_uint numDevices = 0; status = clGetDeviceIDs(platforms[platform_index], CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices); // Allocate enough space for each device cl_device_id *devices; devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id)); // Fill in the devices status = clGetDeviceIDs(platforms[platform_index], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); // Create a command queue and associate it with the device cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status); // Create a buffer object that will contain the data // from the host array A cl_mem bufA; bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status); // Create a buffer object that will contain the data // from the host array B cl_mem bufB; bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status); // Create a buffer object that will hold the output data cl_mem bufC; bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize, NULL, &status); // Write input array A to the device buffer bufferA status = clEnqueueWriteBuffer(cmdQueue, bufA, CL_FALSE, 0, datasize, A, 0, NULL, NULL); // Write input array B to the device buffer bufferB status = clEnqueueWriteBuffer(cmdQueue, bufB, CL_FALSE, 0, datasize, B, 0, NULL, NULL); // Create a program with source code cl_program program = clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status); // Build (compile) the program for the device status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL); // Create the vector addition kernel cl_kernel kernel; kernel = clCreateKernel(program, "vecadd", &status); // Associate the input and output buffers with the kernel status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB); status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC); // Define an index space (global work size) of work // items for execution. A workgroup size (local work size) // is not required, but can be used. size_t globalWorkSize[1]; // There are 'elements' work-items globalWorkSize[0] = elements; // Execute the kernel for execution status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); // Read the device output buffer to the host output array clEnqueueReadBuffer(cmdQueue, bufC, CL_TRUE, 0, datasize, C, 0, NULL, NULL); // Verify the output int result = 1; for(i = 0; i < elements; i++) { if(C[i] != i+i) { result = 0; break; } } if(result) { printf("Output is correct\n"); } else { printf("Output is incorrect\n"); } // Free OpenCL resources clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(bufA); clReleaseMemObject(bufB); clReleaseMemObject(bufC); clReleaseContext(context); // Free host resources free(A); free(B); free(C); free(platforms); free(devices); return 0; }
struct cl_package initFPGA( const char* xclbin, const char* kernel_name ) { /*****************************************/ /* Initialize OpenCL */ /*****************************************/ // Retrieve the number of platforms cl_uint numPlatforms = 0; cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); //printf("Found %d platforms support OpenCL, return code %d.\n", numPlatforms, status); // Allocate enough space for each platform cl_platform_id *platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) printf("clGetPlatformIDs error(%d)\n", status); // Retrieve the number of devices cl_uint numDevices = 0; #ifndef FPGA_DEVICE status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); #else status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &numDevices); #endif printf("Found %d devices support OpenCL.\n", numDevices); // Allocate enough space for each device cl_device_id *devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id)); // Fill in the devices #ifndef FPGA_DEVICE status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); #else status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, numDevices, devices, NULL); #endif if (status != CL_SUCCESS) printf("clGetDeviceIDs error(%d)\n", status); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); if (status != CL_SUCCESS) printf("clCreateContext error(%d)\n", status); //Create a command-queue cl_command_queue clCommandQue = clCreateCommandQueue(context, devices[0], 0, &status); if (status != CL_SUCCESS) printf("clCreateCommandQueue error(%d)\n", status); // 6. Load and build OpenCL kernel #ifndef FPGA_DEVICE // Create a program with source code cl_program program = clCreateProgramWithSource(context, 1, (const char**)&logistic_cl, NULL, &status); if (status != 0) printf("clCreateProgramWithSource error(%d)\n", status); // Build (compile) the program for the device status = clBuildProgram(program, 1, devices, NULL, NULL, NULL); #else // Load binary from disk unsigned char *kernelbinary; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("ERROR: failed to load kernel from xclbin: %s\n", xclbin); exit(1); } size_t n_bit = n_i; // Create the compute program from offline cl_program program = clCreateProgramWithBinary(context, 1, &devices[0], &n_bit, (const unsigned char **) &kernelbinary, NULL, &status); if ((!program) || (status != CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", status); exit(1); } // Build the program executable status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); #endif if (status != 0) { char errmsg[2048]; size_t sizemsg = 0; status = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 2048*sizeof(char), errmsg, &sizemsg); printf("clBuildProgram error(%d)\n", status); printf("Compilation messages: \n %s", errmsg); } cl_kernel clKernel = clCreateKernel(program, kernel_name, &status); if (status != CL_SUCCESS) printf("clCreateKernel error(%d)\n", status); // TODO: parameterize the size of buffers cl_mem d_gradient = clCreateBuffer(context, CL_MEM_READ_WRITE, FEATURE_SIZE*LABEL_SIZE*GROUP_SIZE*sizeof(float), NULL, &status); if (status != CL_SUCCESS) printf("d_gradient clCreateBuffer error(%d)\n", status); cl_mem d_weights = clCreateBuffer(context, CL_MEM_READ_ONLY, FEATURE_SIZE*LABEL_SIZE*sizeof(float), NULL, &status); if (status != CL_SUCCESS) printf("d_weights clCreateBuffer error(%d)\n", status); cl_mem d_data = clCreateBuffer(context, CL_MEM_READ_ONLY, (FEATURE_SIZE+LABEL_SIZE)*CHUNK_SIZE*sizeof(float), NULL, &status); if (status != CL_SUCCESS) printf("d_data clCreateBuffer error(%d)\n", status); struct cl_package result; result.context = context; result.kernel = clKernel; result.commandQueue = clCommandQue; result.d_gradient = d_gradient; result.d_weights = d_weights; result.d_data = d_data; return result; }
int main(int argc, char ** argv) { if(argc != 3){ printf("wrong option... \n"); return 0; } char *needleData = argv[1]; int needleLen = strlen(needleData); int cccharsPerItem; int ggcharsPerItem; int characterSetSize = 128;//ASCii set int *skipTable; // skipTable char *needle = &needleData[0];//!!!Pass it in a wrong way skipTable = HorspoolPrecomputation(needle, characterSetSize); char *cckernelFile = "./kernel/HS_CPU_LocalCounter_OpenCL_Kernel.cl";//the kernel for CPU char *ggkernelFile = "./kernel/HS_GPU_LocalMem_LocalCounter_OpenCL_Kernel.cl";//the kernel for GPU char *cckernelSrc = LoadKernelSrcFromFile(cckernelFile); char *ggkernelSrc = LoadKernelSrcFromFile(ggkernelFile); char *fileName = argv[2]; //Load the haystacks from file FILE *filePtr; filePtr = fopen(fileName, "r"); if(!filePtr) { fprintf(stderr, "can not open the text file!\n"); } fseek(filePtr, 0 , SEEK_END); int fileSize = ftell(filePtr); int ggfileSize = fileSize*0.5; int ccfileSize = fileSize - ggfileSize; rewind(filePtr); //Split the file then I need overlaping! char *gghaystackData = (char*)calloc(ggfileSize, sizeof(char)); char *cchaystackData = (char*)calloc(ccfileSize, sizeof(char)); int textLength; textLength = fread(gghaystackData, sizeof(char), ggfileSize, filePtr); textLength += fread(cchaystackData, sizeof(char), ccfileSize, filePtr); if(textLength != fileSize) { fprintf(stderr, "reading error"); } fclose(filePtr); //~ char *cchaystackData = LoadHaystackDataFromFile(fileName); int cchaystackLen = strlen(cchaystackData); int gghaystackLen = strlen(gghaystackData); cl_int err; cl_platform_id platform; cl_device_id cdevice; cl_device_id ggdevice; //Find a platform err = clGetPlatformIDs(1, &platform, NULL); if(err != CL_SUCCESS) { printf("cant find a platform! \n"); } //Set up devices err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &cdevice, NULL); if(err != CL_SUCCESS) { printf("cant find CPU! \n"); } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ggdevice, NULL); if(err != CL_SUCCESS) { printf("cant find GPU! \n"); } //Create context cl_context ccontext = clCreateContext(NULL, 1, &cdevice, NULL, NULL, &err); if(err != CL_SUCCESS) { printf("cant create a ccontext! \n"); } cl_context ggcontext = clCreateContext(NULL, 1, &ggdevice, NULL, NULL, &err); if(err != CL_SUCCESS) { printf("cant create a ggcontext! \n"); } //Create command queues cl_command_queue cqueue = clCreateCommandQueue(ccontext, cdevice, 0, &err); if(err != CL_SUCCESS) { printf("cant create a cqueue! \n"); } cl_command_queue ggqueue = clCreateCommandQueue(ggcontext, ggdevice, 0, &err); if(err != CL_SUCCESS) { printf("cant create a ggqueue! \n"); } //Create the programm object cl_program cprogram = clCreateProgramWithSource(ccontext, 1, (const char**)&cckernelSrc, NULL, &err); if(err != CL_SUCCESS) { printf("cant build the program! \n"); } cl_program ggprogram = clCreateProgramWithSource(ggcontext, 1, (const char**)&ggkernelSrc, NULL, &err); if(err != CL_SUCCESS) { printf("cant build the program! \n"); } //Build the programm executable err = clBuildProgram(cprogram, 0, NULL, NULL, NULL, NULL); if(err != CL_SUCCESS) { printf("cant build the cprogramm exe! \n"); } err = clBuildProgram(ggprogram, 0, NULL, NULL, NULL, NULL); if(err != CL_SUCCESS) { printf("cant build the ggprogramm exe! \n"); } //Create the kernel cl_kernel ckernel = clCreateKernel(cprogram, "QSMatch", &err); if(err != CL_SUCCESS) { printf("cant create the ckernel! \n"); } cl_kernel ggkernel = clCreateKernel(ggprogram, "QSMatch", &err); if(err != CL_SUCCESS) { printf("cant create the ggkernel! \n"); } //Create the haystack buffer cl_mem cchaystackBuffer = clCreateBuffer(ccontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cchaystackLen, cchaystackData, &err); if(err != CL_SUCCESS) { printf("couldn't create the cchaystackBuffer \n"); } cl_mem gghaystackBuffer = clCreateBuffer(ggcontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, gghaystackLen, gghaystackData, &err); if(err != CL_SUCCESS) { printf("couldn't create the gghaystackBuffer \n"); } //Create the results buffer int ccres[4] = {0}; int ggres[80] = {0}; cl_mem ccresBuffer = clCreateBuffer(ccontext, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(ccres), ccres, &err); if(err != CL_SUCCESS){ printf("couldn't create the ccresBuffer"); } cl_mem ggresBuffer = clCreateBuffer(ggcontext, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(ggres), ggres, &err); if(err != CL_SUCCESS){ printf("couldn't create the ggresBuffer"); } //Create the needleBuffer cl_mem ccneedleBuffer = clCreateBuffer(ccontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(char)*needleLen, needleData, &err); if(err != CL_SUCCESS){ printf("couldn't create the ccneedleBuffer \n"); } cl_mem ggneedleBuffer = clCreateBuffer(ggcontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(char)*needleLen, needleData, &err); if(err != CL_SUCCESS){ printf("couldn't create the ggneedleBuffer \n"); } //Create the skipTableBuffer cl_mem ccskipTableBuffer = clCreateBuffer(ccontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*characterSetSize, skipTable, &err); if(err != CL_SUCCESS){ printf("couldn't create the ccskipTableBuffer \n"); } cl_mem ggskipTableBuffer = clCreateBuffer(ggcontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*characterSetSize, skipTable, &err); if(err != CL_SUCCESS){ printf("couldn't create the ggskipTableBuffer \n"); } //Determine global size and local size size_t cclocalSize = 64; size_t gglocalSize = 256; size_t ccGlobalSize = 64*4; size_t ggGlobalSize = 256*5*8; cccharsPerItem = cchaystackLen/ccGlobalSize + 1;//Add 1 it important otherwise some patterns will be lost ggcharsPerItem = cchaystackLen/ggGlobalSize + 1; //Set the kernel arguments for CPU err = clSetKernelArg(ckernel, 0, sizeof(cl_mem), &cchaystackBuffer); err |= clSetKernelArg(ckernel, 1, sizeof(cl_mem), &ccneedleBuffer); err |= clSetKernelArg(ckernel, 2, sizeof(cl_mem), &ccskipTableBuffer); err |= clSetKernelArg(ckernel, 3, sizeof(int)*1, NULL); err |= clSetKernelArg(ckernel, 4, sizeof(needleLen), &needleLen); err |= clSetKernelArg(ckernel, 5, sizeof(cccharsPerItem), &cccharsPerItem); err |= clSetKernelArg(ckernel, 6, sizeof(cl_mem), &ccresBuffer); if(err != CL_SUCCESS) { printf("couldn't set the ckernel arguments \n"); } //Set the kernel arguments for GPU err = clSetKernelArg(ggkernel, 0, sizeof(cl_mem), &gghaystackBuffer); err |= clSetKernelArg(ggkernel, 1, sizeof(cl_mem), &ggneedleBuffer); err |= clSetKernelArg(ggkernel, 2, sizeof(char)*needleLen, NULL); err |= clSetKernelArg(ggkernel, 3, sizeof(cl_mem), &ggskipTableBuffer); err |= clSetKernelArg(ggkernel, 4, sizeof(int)*128, NULL); err |= clSetKernelArg(ggkernel, 5, sizeof(int)*1, NULL); err |= clSetKernelArg(ggkernel, 6, sizeof(needleLen), &needleLen); err |= clSetKernelArg(ggkernel, 7, sizeof(ggcharsPerItem), &ggcharsPerItem); err |= clSetKernelArg(ggkernel, 8, sizeof(cl_mem), &ggresBuffer); if(err != CL_SUCCESS) { printf("couldn't set the ggkernel arguments \n"); } //Execut the kernel //On CPU err = clEnqueueNDRangeKernel(cqueue, ckernel, 1, NULL, &ccGlobalSize, &cclocalSize, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("ckernel could not be executed... \n"); } else { printf("ckernel has been executed successfully! \n"); } //~ //On GPU err = clEnqueueNDRangeKernel(ggqueue, ggkernel, 1, NULL, &ggGlobalSize, &gglocalSize, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("ggkernel could not be executed... \n"); } else { printf("ggkernel has been executed successfully! \n"); } //Finish the queue lists clFinish(ggqueue); clFinish(cqueue); //Copy the results clEnqueueReadBuffer(cqueue, ccresBuffer, CL_TRUE, 0, sizeof(ccres), ccres, 0, NULL, NULL); clEnqueueReadBuffer(ggqueue, ggresBuffer, CL_TRUE, 0, sizeof(ggres), ggres, 0, NULL, NULL); PrintTheResults(ccres, ggres); //Clean up free(skipTable); free(cckernelSrc); free(ggkernelSrc); free(cchaystackData); free(gghaystackData); clReleaseMemObject(cchaystackBuffer); clReleaseMemObject(ccneedleBuffer); clReleaseMemObject(ccskipTableBuffer); clReleaseMemObject(ccresBuffer); clReleaseMemObject(gghaystackBuffer); clReleaseMemObject(ggneedleBuffer); clReleaseMemObject(ggskipTableBuffer); clReleaseMemObject(ggresBuffer); clReleaseKernel(ckernel); clReleaseKernel(ggkernel); clReleaseProgram(cprogram); clReleaseProgram(ggprogram); clReleaseCommandQueue(cqueue); clReleaseCommandQueue(ggqueue); clReleaseContext(ccontext); clReleaseContext(ggcontext); return 0; }
void mandelbrot(int m, int n) { cl_platform_id *platform; cl_device_type dev_type = CL_DEVICE_TYPE_GPU; cl_device_id *devs = NULL; cl_context context; cl_command_queue *cmd_queues; cl_program program; cl_kernel *kernels; cl_mem *mem_R; cl_mem *mem_G; cl_mem *mem_B; cl_int err; cl_uint num_platforms; cl_uint num_devs = 0; cl_event *ev_kernels; int count_max = COUNT_MAX; int i, j, jhi, jlo; char *output_filename = "mandelbrot.ppm"; FILE *output_unit; double wtime; float x_max = 1.25; float x_min = - 2.25; // float x; // float x1; // float x2; float y_max = 1.75; float y_min = - 1.75; //float y; //float y1; //float y2; size_t size_color; size_color = sizeof(int) * m * n; int (*r)[n] = (int (*)[n])calloc(m * n, sizeof(int)); int (*g)[n] = (int (*)[n])calloc(m * n, sizeof(int)); int (*b)[n] = (int (*)[n])calloc(m * n, sizeof(int)); printf( " Sequential C version\n" ); printf( "\n" ); printf( " Create an ASCII PPM image of the Mandelbrot set.\n" ); printf( "\n" ); printf( " For each point C = X + i*Y\n" ); printf( " with X range [%g,%g]\n", x_min, x_max ); printf( " and Y range [%g,%g]\n", y_min, y_max ); printf( " carry out %d iterations of the map\n", count_max ); printf( " Z(n+1) = Z(n)^2 + C.\n" ); printf( " If the iterates stay bounded (norm less than 2)\n" ); printf( " then C is taken to be a member of the set.\n" ); printf( "\n" ); printf( " An ASCII PPM image of the set is created using\n" ); printf( " M = %d pixels in the X direction and\n", m ); printf( " N = %d pixels in the Y direction.\n", n ); timer_init(); timer_start(0); // Platform err = clGetPlatformIDs(0, NULL, &num_platforms); CHECK_ERROR(err); if (num_platforms == 0) { fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__); exit(EXIT_FAILURE); } printf("Number of platforms: %u\n", num_platforms); platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); err = clGetPlatformIDs(num_platforms, platform, NULL); CHECK_ERROR(err); // Device for (i = 0; i < num_platforms; i++) { err = clGetDeviceIDs(platform[i], dev_type, 0, NULL, &num_devs); if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err); num_devs = 1; //** if (num_devs >= 1) { devs = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devs); err = clGetDeviceIDs(platform[i], dev_type, num_devs, devs, NULL); break; } } if ( devs == NULL || num_devs < 1) { fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } for( i = 0; i < num_devs; ++i ) { printf("dev[%d] : ", i); print_device_name(devs[i]); } // Context context = clCreateContext(NULL, num_devs, devs, NULL, NULL, &err); CHECK_ERROR(err); // Command queue cmd_queues = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_devs); for( i = 0; i < num_devs; ++i) { cmd_queues[i] = clCreateCommandQueue(context, devs[i], 0, &err); CHECK_ERROR(err); } // Create a program. size_t source_len; char *source_code = get_source_code("./mandelbrot_kernel.cl", &source_len); program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_len, &err); free(source_code); CHECK_ERROR(err); // Build the program. char build_opts[200]; sprintf(build_opts, "-Dm=%d -Dn=%d -Dnum_devs=%d", m, n, num_devs); err = clBuildProgram(program, num_devs, devs, build_opts, NULL, NULL); if (err != CL_SUCCESS) { print_build_log(program, devs[0]); CHECK_ERROR(err); } // Kernel kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*num_devs); for (i = 0; i < num_devs; i++) { kernels[i] = clCreateKernel(program, "mandelbrot_kernel", NULL); } // Buffers mem_R = (cl_mem*)malloc(sizeof(cl_mem)*num_devs); mem_G = (cl_mem*)malloc(sizeof(cl_mem)*num_devs); mem_B = (cl_mem*)malloc(sizeof(cl_mem)*num_devs); for(i = 0; i < num_devs; i++) { mem_R[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, size_color / num_devs, NULL, NULL); mem_G[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, size_color / num_devs, NULL, NULL); mem_B[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, size_color / num_devs, NULL, NULL); } /* // Write to Buffers for(i = 0; i < num_devs; i++) { clEnqueueWriteBuffer(cmd_queues[i], mem_CHECK[i], CL_FALSE, 0, size_CHECK / num_devs, (CHECK + (N / num_devs) * i), 0, NULL, NULL); } */ // Set the arguments. for(i = 0; i < num_devs; i++) { // flag = i * (m * n / num_devs); clSetKernelArg(kernels[i], 0, sizeof(cl_mem), (void*) &mem_R[i]); clSetKernelArg(kernels[i], 1, sizeof(cl_mem), (void*) &mem_G[i]); clSetKernelArg(kernels[i], 2, sizeof(cl_mem), (void*) &mem_B[i]); clSetKernelArg(kernels[i], 3, sizeof(int), &count_max); clSetKernelArg(kernels[i], 4, sizeof(float), &x_max); clSetKernelArg(kernels[i], 5, sizeof(float), &x_min); clSetKernelArg(kernels[i], 6, sizeof(float), &y_max); clSetKernelArg(kernels[i], 7, sizeof(float), &y_min); } // Enqueue the kernel. size_t lws[1] = {256}; size_t gws[1] = { m * n /num_devs }; gws[0] = (size_t)ceil((double)m * n / lws[0]) * lws[0]; ev_kernels = (cl_event*)malloc(sizeof(cl_event)*num_devs); for(i = 0; i < num_devs; i++) { err = clEnqueueNDRangeKernel(cmd_queues[i], kernels[i], 1, NULL, gws, lws, 0, NULL, &ev_kernels[i]); CHECK_ERROR(err); } // Read the result. for(i = 0; i < num_devs; i++) { err = clEnqueueReadBuffer(cmd_queues[i], mem_R[i], CL_TRUE, 0, size_color / num_devs, r, 1, &ev_kernels[i], NULL); err = clEnqueueReadBuffer(cmd_queues[i], mem_G[i], CL_TRUE, 0, size_color / num_devs, g, 1, &ev_kernels[i], NULL); err = clEnqueueReadBuffer(cmd_queues[i], mem_B[i], CL_TRUE, 0, size_color / num_devs, b, 1, &ev_kernels[i], NULL); } // Release for( i = 0; i < num_devs; ++i ) { clFinish(cmd_queues[i]); clReleaseMemObject(mem_R[i]); clReleaseMemObject(mem_G[i]); clReleaseMemObject(mem_B[i]); clReleaseKernel(kernels[i]); clReleaseCommandQueue(cmd_queues[i]); clReleaseEvent(ev_kernels[i]); } clReleaseProgram(program); clReleaseContext(context); free(mem_R); free(mem_G); free(mem_B); free(cmd_queues); free(kernels); free(devs); free(ev_kernels); free(platform); timer_stop(0); wtime = timer_read(0); printf( "\n" ); printf( " Time = %lf seconds.\n", wtime ); // Write data to an ASCII PPM file. output_unit = fopen( output_filename, "wt" ); fprintf( output_unit, "P3\n" ); fprintf( output_unit, "%d %d\n", n, m ); fprintf( output_unit, "%d\n", 255 ); for ( i = 0; i < m; i++ ) { for ( jlo = 0; jlo < n; jlo = jlo + 4 ) { jhi = MIN( jlo + 4, n ); for ( j = jlo; j < jhi; j++ ) { fprintf( output_unit, " %d %d %d", r[i][j], g[i][j], b[i][j] ); } fprintf( output_unit, "\n" ); } } fclose( output_unit ); printf( "\n" ); printf( " Graphics data written to \"%s\".\n\n", output_filename ); // Terminate. free(r); free(g); free(b); }
int main() { // This code executes on the OpenCL host // Host data int *A = NULL; // Input array int *B = NULL; // Input array int *C = NULL; // Output array // Elements in each array const int elements = 2048; // Compute the size of the data size_t datasize = sizeof(int)*elements; // Allocate space for input/output data A = (int*)malloc(datasize); B = (int*)malloc(datasize); C = (int*)malloc(datasize); // Initialize the input data for(int i = 0; i < elements; i++) { A[i] = i; B[i] = i; } // Use this to check the output of each API call cl_int status; //----------------------------------------------------- // STEP 1: Discover and initialize the platforms //----------------------------------------------------- cl_uint numPlatforms = 0; cl_platform_id *platforms = NULL; // Use clGetPlatformIDs() to retrieve the number of // platforms status = clGetPlatformIDs(0, NULL, &numPlatforms); // Allocate enough space for each platform platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id)); // Fill in platforms with clGetPlatformIDs() status = clGetPlatformIDs(numPlatforms, platforms, NULL); //----------------------------------------------------- // STEP 2: Discover and initialize the devices //----------------------------------------------------- cl_uint numDevices = 0; cl_device_id *devices = NULL; // Use clGetDeviceIDs() to retrieve the number of // devices present status = clGetDeviceIDs( platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); // Allocate enough space for each device devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id)); // Fill in devices with clGetDeviceIDs() status = clGetDeviceIDs( platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); //----------------------------------------------------- // STEP 3: Create a context //----------------------------------------------------- cl_context context = NULL; // Create a context using clCreateContext() and // associate it with the devices context = clCreateContext( NULL, numDevices, devices, NULL, NULL, &status); //----------------------------------------------------- // STEP 4: Create a command queue //----------------------------------------------------- cl_command_queue cmdQueue; // Create a command queue using clCreateCommandQueue(), // and associate it with the device you want to execute // on cmdQueue = clCreateCommandQueue( context, devices[0], 0, &status); //----------------------------------------------------- // STEP 5: Create device buffers //----------------------------------------------------- cl_mem bufferA; // Input array on the device cl_mem bufferB; // Input array on the device cl_mem bufferC; // Output array on the device // Use clCreateBuffer() to create a buffer object (d_A) // that will contain the data from the host array A bufferA = clCreateBuffer( context, CL_MEM_READ_ONLY, datasize, NULL, &status); // Use clCreateBuffer() to create a buffer object (d_B) // that will contain the data from the host array B bufferB = clCreateBuffer( context, CL_MEM_READ_ONLY, datasize, NULL, &status); // Use clCreateBuffer() to create a buffer object (d_C) // with enough space to hold the output data bufferC = clCreateBuffer( context, CL_MEM_WRITE_ONLY, datasize, NULL, &status); //----------------------------------------------------- // STEP 6: Write host data to device buffers //----------------------------------------------------- // Use clEnqueueWriteBuffer() to write input array A to // the device buffer bufferA status = clEnqueueWriteBuffer( cmdQueue, bufferA, CL_FALSE, 0, datasize, A, 0, NULL, NULL); // Use clEnqueueWriteBuffer() to write input array B to // the device buffer bufferB status = clEnqueueWriteBuffer( cmdQueue, bufferB, CL_FALSE, 0, datasize, B, 0, NULL, NULL); //----------------------------------------------------- // STEP 7: Create and compile the program //----------------------------------------------------- // Create a program using clCreateProgramWithSource() cl_program program = clCreateProgramWithSource( context, 1, (const char**)&programSource, NULL, &status); // Build (compile) the program for the devices with // clBuildProgram() status = clBuildProgram( program, numDevices, devices, NULL, NULL, NULL); //----------------------------------------------------- // STEP 8: Create the kernel //----------------------------------------------------- cl_kernel kernel = NULL; // Use clCreateKernel() to create a kernel from the // vector addition function (named "vecadd") kernel = clCreateKernel(program, "vecadd", &status); //----------------------------------------------------- // STEP 9: Set the kernel arguments //----------------------------------------------------- // Associate the input and output buffers with the // kernel // using clSetKernelArg() status = clSetKernelArg( kernel, 0, sizeof(cl_mem), &bufferA); status |= clSetKernelArg( kernel, 1, sizeof(cl_mem), &bufferB); status |= clSetKernelArg( kernel, 2, sizeof(cl_mem), &bufferC); //----------------------------------------------------- // STEP 10: Configure the work-item structure //----------------------------------------------------- // Define an index space (global work size) of work // items for // execution. A workgroup size (local work size) is not // required, // but can be used. size_t globalWorkSize[1]; // There are 'elements' work-items globalWorkSize[0] = elements; //----------------------------------------------------- // STEP 11: Enqueue the kernel for execution //----------------------------------------------------- // Execute the kernel by using // clEnqueueNDRangeKernel(). // 'globalWorkSize' is the 1D dimension of the // work-items status = clEnqueueNDRangeKernel( cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); //----------------------------------------------------- // STEP 12: Read the output buffer back to the host //----------------------------------------------------- // Use clEnqueueReadBuffer() to read the OpenCL output // buffer (bufferC) // to the host output array (C) clEnqueueReadBuffer( cmdQueue, bufferC, CL_TRUE, 0, datasize, C, 0, NULL, NULL); // Verify the output bool result = true; for(int i = 0; i < elements; i++) { if(C[i] != i+i) { result = false; break; } } if(result) { printf("Output is correct\n"); } else { printf("Output is incorrect\n"); } //----------------------------------------------------- // STEP 13: Release OpenCL resources //----------------------------------------------------- // Free OpenCL resources clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(bufferA); clReleaseMemObject(bufferB); clReleaseMemObject(bufferC); clReleaseContext(context); // Free host resources free(A); free(B); free(C); free(platforms); free(devices); }
/** * @brief Main program function. * * @param argc Number of cli parameters. If argc > 1, more detailed information will be shown. * @param argv Not relevant. * @return */ int main(int argc, char ** argv) { /* Program variables. */ GError* err = NULL; /* Error reporting object. */ cl_int status; /* Program/function return status variable. */ cl_uint numPlatforms; /* Number of platforms. */ cl_platform_id* platforms = NULL; /* Array of platform IDs. */ cl_uint numDevices; /* Number of devices. */ cl_device_id devices[MAX_DEVICES_QUERY]; /* Array of devices for a given platform. */ /* Auxiliary variables for getting information about platforms and devices. */ char pbuff[MAX_INFO_STRING]; size_t sizetaux; cl_uint uintaux; cl_ulong ulongaux, ulongaux2; cl_device_type dtypeaux; cl_device_local_mem_type dlmt; cl_bool boolaux; cl_command_queue_properties cqpaux; /* Avoid compiler warning. */ argv = argv; /* Get number of platforms. */ status = clGetPlatformIDs(0, NULL, &numPlatforms); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get number of platforms.", status); /* Get platform IDs. */ platforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id)); gef_if_error_create_goto(err, CLU_UTILS_ERROR, platforms == NULL, CLU_ERROR_NOALLOC, error_handler, "Unable to allocate memory for list of platform IDs."); status = clGetPlatformIDs(numPlatforms, platforms, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get list of platform IDs.", status); /* Print number of platforms. */ printf("Number of platforms: %d\n", numPlatforms); /* Cycle through platforms */ for(unsigned int i = 0; i < numPlatforms; i++) { /* Get platform vendor. */ status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get platform vendor.", status); /* Print plaform vendor. */ printf("Platform #%d: %s\n", i, pbuff); /* Get devices in platform */ status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, MAX_DEVICES_QUERY, devices, &numDevices); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to platform devices.", status); /* Cycle through devices in current platform. */ for (unsigned int j = 0; j < numDevices; j++) { /* Device name. */ status = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get device name.", status); printf("\tDevice #%d: %s\n", j, pbuff); /* Device vendor. */ status = clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(pbuff), pbuff, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get device vendor.", status); printf("\t Vendor: %s\n", pbuff); /* Device type. */ status = clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(cl_device_type), &dtypeaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get device type.", status); printf("\t Type: %s\n", clu_device_type_str_get(dtypeaux, 0, pbuff, MAX_INFO_STRING)); /* OpenCL C version. */ status = clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, sizeof(pbuff), pbuff, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get device OpenCL C version.", status); printf("\t %s\n", pbuff); /* Max. compute units. */ status = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uintaux), &uintaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get device max. compute units.", status); printf("\t Max. Compute units: %d\n", uintaux); /* Global memory info. */ status = clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(ulongaux), &ulongaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get device global memory size.", status); status = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(ulongaux2), &ulongaux2, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get device maximum allocable memory.", status); status = clGetDeviceInfo(devices[j], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(boolaux), &boolaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get determine if device memory is unified with host.", status); printf("\t Global mem. size: %ld Mb %s (max. alloc. %ld Mb)\n", (unsigned long int) ulongaux / 1024l / 1024l, boolaux ? "shared with host" : "dedicated", (unsigned long int) ulongaux2 / 1024l / 1024l); /* Local memory info. */ status = clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_TYPE, sizeof(dlmt), &dlmt, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get type of local memory in device.", status); status = clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(ulongaux), &ulongaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get size of local memory in device.", status); printf("\t Local mem. size (type): %ld Kb (%s)\n", (unsigned long int) ulongaux / 1024l, (dlmt == CL_LOCAL ? "local" : "global")); /* Maximum work group size. */ status = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(sizetaux), &sizetaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get size of local memory in device.", status); printf("\t Max. work-group size: %d\n", (int) sizetaux); /* Print extra info if any arg is given */ if (argc > 1) { /* Maximum constant buffer size. */ status = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(ulongaux), &ulongaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get maximum constant buffer size.", status); printf("\t Max. constant buffer size: %lu Kb\n", (unsigned long) (ulongaux / 1024)); /* Device endianess.*/ status = clGetDeviceInfo(devices[j], CL_DEVICE_ENDIAN_LITTLE, sizeof(boolaux), &boolaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get device endianess.", status); printf("\t Endianness: %s\n", boolaux ? "Little" : "Big"); /* Preferred vector width. */ printf("\t Pref. vec. width:"); status = clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(uintaux), &uintaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get preferred vector width for char.", status); printf(" Char=%d,", uintaux); status = clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(uintaux), &uintaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get preferred vector width for short.", status); printf(" Short=%d,", uintaux); status = clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(uintaux), &uintaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get preferred vector width for int.", status); printf(" Int=%d,", uintaux); status = clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(uintaux), &uintaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get preferred vector width for long.", status); printf(" Long=%d,", uintaux); status = clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(uintaux), &uintaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get preferred vector width for float.", status); printf(" Float=%d,", uintaux); status = clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(uintaux), &uintaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get preferred vector width for double.", status); printf(" Double=%d,", uintaux); status = clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, sizeof(uintaux), &uintaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get preferred vector width for half.", status); printf(" Half=%d.\n", uintaux); /* Acceptable command queue properties. */ status = clGetDeviceInfo(devices[j], CL_DEVICE_QUEUE_PROPERTIES, sizeof(cqpaux), &cqpaux, NULL); gef_if_error_create_goto(err, CLU_UTILS_ERROR, CL_SUCCESS != status, status, error_handler, "OpenCL error %d: unable to get acceptable command queue properties.", status); printf("\t Command queue properties:"); if (cqpaux & CL_QUEUE_PROFILING_ENABLE) printf(" Prof. OK,"); else printf("Prof. KO,"); if (cqpaux & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) printf(" Out-of-order OK\n"); else printf(" Out-of-order KO\n"); } } } /* If we get here, no need for error checking, jump to cleanup. */ g_assert (err == NULL); status = CL_SUCCESS; goto cleanup; error_handler: /* If we got here there was an error, verify that it is so. */ g_assert (err != NULL); fprintf(stderr, "%s", err->message); status = err->code; g_error_free(err); cleanup: /* Free stuff! */ if (platforms) free(platforms); /* Return status. */ return status; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place vectors inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); /* Call clblas function. */ err = clblasSswap( N, bufX, 0, incx, bufY, 0, incy, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSswap() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); /* At this point you will get the result of SSWAP placed in vector X. */ printResult(); } /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'add_sat_ushort2ushort2.cl' */ source_code = read_buffer("add_sat_ushort2ushort2.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "add_sat_ushort2ushort2", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_ushort2 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_ushort2)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_ushort2){{2, 2}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ushort2), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ushort2), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create and init host side src buffer 1 */ cl_ushort2 *src_1_host_buffer; src_1_host_buffer = malloc(num_elem * sizeof(cl_ushort2)); for (int i = 0; i < num_elem; i++) src_1_host_buffer[i] = (cl_ushort2){{2, 2}}; /* Create and init device side src buffer 1 */ cl_mem src_1_device_buffer; src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ushort2), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ushort2), src_1_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_ushort2 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_ushort2)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_ushort2)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_ushort2), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_ushort2), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_ushort2)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 1 */ free(src_1_host_buffer); /* Free device side src buffer 1 */ ret = clReleaseMemObject(src_1_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
int main(int argc, char *argv[]) { // Declare variables - y', L // Load from file? Declare within host? Type y[K]; Type L[K*K]; Type R[K]; Type m[K]; Complex Xml[K]; int check_result; cl_mem input_y; cl_mem input_L; cl_mem output_xml; // OpenCL-specific variables cl_device_id device_id; cl_platform_id platform_id; cl_context context; cl_command_queue commands; cl_program program; cl_kernel SDkernel; cl_int dev_type; cl_int error; cl_event event; FILE *kernel; char *kernelSRC; size_t global[2]; size_t local[2]; if(argc > 1) { kernel = argv[1]; } else printf("\nError - must specify arguments\n"); //-------------------------------------------------------------------------------- // Create a context, queue and device. //-------------------------------------------------------------------------------- cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Secure a device for (int i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &error); if (!context) { printf("Error: Failed to create a compute context!\n%s\n", err_code(error)); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &error); if (!commands) { printf("Error: Failed to create a command commands!\n%s\n", err_code(error)); return EXIT_FAILURE; } // Create buffers for each argument of kernel // Need to add for R^2 and m input_y = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(Type) * K, y, &err); input_L = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(Type) * K * K, L, &err); output_xml = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(Complex) * K, Xml, &err); if (err != CL_SUCCESS) { printf("Error: failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **)&kernelSRC, NULL, &error); if (err != CL_SUCCESS) { printf("Error: could not create program\n%s\n", err_code(err)); return EXIT_FAILURE; } // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "SD", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clEnqueueWriteBuffer(commands, input_y, CL_TRUE, 0, sizeof(Type)*K, y, 0, NULL, NULL); err = clEnqueueWriteBuffer(commands, input_L, CL_TRUE, 0, sizeof(Type)*K*K, L, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Error: could not write buffer\nError code %d\n", err); return EXIT_FAILURE; } err = clSetKernelArg(SDkernel, 0, sizeof(cl_mem), &input_y); err |= clSetKernelArg(SDkernel, 1, sizeof(cl_mem), &input_L); err |= clSetKernelArg(SDkernel, 2, sizeof(cl_mem), &input_R); err |= clSetKernelArg(SDkernel, 3, sizeof(cl_mem), &output_m); err |= clSetKernelArg(SDkernel, 4, sizeof(cl_mem), &output_xml); if(err != CL_SUCCESS) { printf("Error: could not set kernel arguments\nError code %d\n", err); return EXIT_FAILURE; } global[0] = K; global[1] = K; local[0] = K; local[1] = 1; err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, (size_t*)&global, (size_t*)&local, 0, NULL, &event); if(err != CL_SUCCESS) { printf("Error: could not set ND range\nError code %d\n", err); return EXIT_FAILURE; } clEnqueueReadBuffer(commands, output_m, CL_TRUE, 0, sizeof(Type)*K, m, 0, NULL, NULL); clEnqueueReadBuffer(commands, output_xml, CL_TRUE, 0, sizeof(Type)*K, m, 0, NULL, NULL); clReleaseMemObject(input_y); clReleaseMemObject(input_L); clReleaseMemObject(output_xml); clReleaseProgram(program); clReleaseKernel(SDkernel); clReleaseCommandQueue(commands); clReleaseContext(context); return EXIT_SUCCESS; }
int SDKSample::validatePlatformAndDeviceOptions() { cl_int status = CL_SUCCESS; cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(status != CL_SUCCESS) { std::cout<<"Error: clGetPlatformIDs failed. Error code : "; std::cout << streamsdk::getOpenCLErrorCodeStr(status) << std::endl; return SDK_FAILURE; } if (0 < numPlatforms) { // Validate platformId if(platformId >= numPlatforms) { if(numPlatforms - 1 == 0) std::cout << "platformId should be 0" << std::endl; else std::cout << "platformId should be 0 to " << numPlatforms - 1 << std::endl; usage(); return SDK_FAILURE; } // Get selected platform cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: clGetPlatformIDs failed. Error code : "; std::cout << streamsdk::getOpenCLErrorCodeStr(status) << std::endl; return SDK_FAILURE; } // Print all platforms for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: clGetPlatformInfo failed. Error code : "; std::cout << streamsdk::getOpenCLErrorCodeStr(status) << std::endl; return SDK_FAILURE; } std::cout << "Platform " << i << " : " << pbuf << std::endl; } // Get AMD platform for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: clGetPlatformInfo failed. Error code : "; std::cout << streamsdk::getOpenCLErrorCodeStr(status) << std::endl; return SDK_FAILURE; } platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } if(isPlatformEnabled()) platform = platforms[platformId]; // Check for AMD platform char pbuf[100]; status = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: clGetPlatformInfo failed. Error code : "; std::cout << streamsdk::getOpenCLErrorCodeStr(status) << std::endl; return SDK_FAILURE; } if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) amdPlatform = true; cl_device_type dType = CL_DEVICE_TYPE_GPU; if(deviceType.compare("cpu") == 0) dType = CL_DEVICE_TYPE_CPU; if(deviceType.compare("gpu") == 0) dType = CL_DEVICE_TYPE_GPU; else dType = CL_DEVICE_TYPE_ALL; // Check for GPU if(dType == CL_DEVICE_TYPE_GPU) { cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; cl_context context = clCreateContextFromType(cps, dType, NULL, NULL, &status); if(status == CL_DEVICE_NOT_FOUND) { dType = CL_DEVICE_TYPE_CPU; gpu = false; } clReleaseContext(context); } // Get device count cl_uint deviceCount = 0; status = clGetDeviceIDs(platform, dType, 0, NULL, &deviceCount); if(status != CL_SUCCESS) { std::cout<<"Error: clGetDeviceIDs failed. Error code : "; std::cout << streamsdk::getOpenCLErrorCodeStr(status) << std::endl; return SDK_FAILURE; } // Validate deviceId if(deviceId >= deviceCount) { if(deviceCount - 1 == 0) std::cout << "deviceId should be 0" << std::endl; else std::cout << "deviceId should be 0 to " << deviceCount - 1 << std::endl; usage(); return SDK_FAILURE; } delete[] platforms; } return SDK_SUCCESS; }
void mat_mul_opencl_1d(float *M_A, float *M_B, float *M_C, size_t ROW_A, size_t COL_A, size_t COL_B) { cl_platform_id *platform; cl_device_type dev_type; cl_device_id dev; cl_context context; cl_command_queue cmd_queue; cl_program program; cl_kernel kernel; cl_mem mem_A, mem_B, mem_C; cl_event ev_kernel; cl_int err; cl_uint num_platforms; cl_uint num_dev = 0; int i; // Platform err = clGetPlatformIDs(0, NULL, &num_platforms); CHECK_ERROR(err); if (num_platforms == 0) { fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__); exit(EXIT_FAILURE); } printf("Number of platforms: %u\n", num_platforms); platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); err = clGetPlatformIDs(num_platforms, platform, NULL); CHECK_ERROR(err); // Device dev_type = get_device_type(); for (i = 0; i < num_platforms; i++) { err = clGetDeviceIDs(platform[i], dev_type, 1, &dev, &num_dev); if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err); if (num_dev == 1) break; } if (num_dev < 1) { fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } print_device_name(dev); free(platform); // Context context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err); CHECK_ERROR(err); // Command queue cmd_queue = clCreateCommandQueue(context, dev, CL_QUEUE_PROFILING_ENABLE, &err); CHECK_ERROR(err); // Create a program. char *source_code = get_source_code("./kernel_1d.cl"); program = clCreateProgramWithSource(context, 1, (const char **)&source_code, NULL, &err); free(source_code); CHECK_ERROR(err); // Build the program. char build_opts[200]; sprintf(build_opts, "-DROW_A=%lu -DCOL_A=%lu -DCOL_B=%lu", ROW_A, COL_A, COL_B); err = clBuildProgram(program, 1, &dev, build_opts, NULL, NULL); if (err != CL_SUCCESS) { print_build_log(program, dev); CHECK_ERROR(err); } // Kernel kernel = clCreateKernel(program, "mat_mul", &err); CHECK_ERROR(err); // Buffers mem_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * ROW_A * COL_A, M_A, &err); CHECK_ERROR(err); mem_B = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * COL_A * COL_B, NULL, &err); CHECK_ERROR(err); err = clEnqueueWriteBuffer(cmd_queue, mem_B, CL_FALSE, 0, sizeof(float) * COL_A * COL_B, M_B, 0, NULL, NULL); CHECK_ERROR(err) mem_C = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * ROW_A * COL_B, NULL, &err); CHECK_ERROR(err); // Set the arguments. err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_A); CHECK_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_B); CHECK_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_C); CHECK_ERROR(err); // Enqueue the kernel. size_t lws[1] = {256}; size_t gws[1]; gws[0] = (size_t)ceil((double)ROW_A / lws[0]) * lws[0]; err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, gws, lws, 0, NULL, &ev_kernel); CHECK_ERROR(err); // Read the result. err = clEnqueueReadBuffer(cmd_queue, mem_C, CL_TRUE, 0, sizeof(float) * ROW_A * COL_B, M_C, 0, NULL, NULL); CHECK_ERROR(err); // Read the profiling info. cl_ulong start_time, end_time; err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start_time, NULL); CHECK_ERROR(err); err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end_time, NULL); CHECK_ERROR(err); printf("Kernel time : %lf sec\n", (double)(end_time - start_time) / 10e9); // Release clReleaseEvent(ev_kernel); clReleaseMemObject(mem_A); clReleaseMemObject(mem_B); clReleaseMemObject(mem_C); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); }
int main( void ) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX; float *X; cl_event event = NULL; int ret = 0; size_t N = 16; char platform_name[128]; char device_name[128]; /* FFT library realted declarations */ clfftPlanHandle planHandle; clfftDim dim = CLFFT_1D; size_t clLengths[1] = {N}; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); size_t ret_param_size = 0; err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, &ret_param_size); printf("Platform found: %s\n", platform_name); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL ); err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &ret_param_size); printf("Device found on the above platform: %s\n", device_name); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); queue = clCreateCommandQueue( ctx, device, 0, &err ); /* Setup clFFT. */ clfftSetupData fftSetup; err = clfftInitSetupData(&fftSetup); err = clfftSetup(&fftSetup); /* Allocate host & initialize data. */ /* Only allocation shown for simplicity. */ X = (float *)malloc(N * 2 * sizeof(*X)); /* print input array */ printf("\nPerforming fft on an one dimensional array of size N = %ld\n", N); int print_iter = 0; while(print_iter<N) { float x = (float)print_iter; float y = (float)print_iter*3; X[2*print_iter ] = x; X[2*print_iter+1] = y; printf("(%f, %f) ", x, y); print_iter++; } printf("\n\nfft result: \n"); /* Prepare OpenCL memory objects and place data inside them. */ bufX = clCreateBuffer( ctx, CL_MEM_READ_WRITE, N * 2 * sizeof(*X), NULL, &err ); err = clEnqueueWriteBuffer( queue, bufX, CL_TRUE, 0, N * 2 * sizeof( *X ), X, 0, NULL, NULL ); /* Create a default plan for a complex FFT. */ err = clfftCreateDefaultPlan(&planHandle, ctx, dim, clLengths); /* Set plan parameters. */ err = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE); err = clfftSetLayout(planHandle, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED); err = clfftSetResultLocation(planHandle, CLFFT_INPLACE); /* Bake the plan. */ err = clfftBakePlan(planHandle, 1, &queue, NULL, NULL); /* Execute the plan. */ err = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &queue, 0, NULL, NULL, &bufX, NULL, NULL); /* Wait for calculations to be finished. */ err = clFinish(queue); /* Fetch results of calculations. */ err = clEnqueueReadBuffer( queue, bufX, CL_TRUE, 0, N * 2 * sizeof( *X ), X, 0, NULL, NULL ); /* print output array */ print_iter = 0; while(print_iter<N) { printf("(%f, %f) ", X[2*print_iter], X[2*print_iter+1]); print_iter++; } printf("\n"); /* Release OpenCL memory objects. */ clReleaseMemObject( bufX ); free(X); /* Release the plan. */ err = clfftDestroyPlan( &planHandle ); /* Release clFFT library. */ clfftTeardown( ); /* Release OpenCL working objects. */ clReleaseCommandQueue( queue ); clReleaseContext( ctx ); return ret; }
int main(int argc, char** argv) { cl_int error; cl_uint num_platforms; // Get the number of platforms error = clGetPlatformIDs(0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of platforms cl_platform_id* platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs(num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Print the chosen platform (if there are multiple platforms, choose the first one) cl_platform_id platform = platforms[0]; char pbuf[100]; error = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); // Create a GPU context // cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; // context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error); // if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_int result; cl_device_id device_id = NULL; cl_uint ret_num_devices; result = clGetDeviceIDs( platform, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); if(result!=CL_SUCCESS){ printf("Runtime error! unable to retrieve the device id of CL_DEVICE_TYPE_DEFAULT\n"); exit(-1); } cl_device_id* __ipmacc_cldevs; __ipmacc_cldevs=(cl_device_id*)malloc(sizeof(cl_device_id)*1); __ipmacc_cldevs[0]=device_id; context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &result); if(result!=CL_SUCCESS){ printf("Runtime error! Cannot open context on the device %d of CL_DEVICE_TYPE_DEFAULT\n",device_id); exit(-1); } // END OF CODE // Get and print the chosen device (if there are multiple devices, choose the first one) size_t devices_size; error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo(context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); device = devices[0]; error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); // Create a command queue command_queue = clCreateCommandQueue(context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); int size; int grid_rows,grid_cols = 0; double *FilesavingTemp,*FilesavingPower; //,*MatrixOut; char *tfile, *pfile, *ofile; int total_iterations = 60; int pyramid_height = 1; // number of iterations if (argc < 7) usage(argc, argv); if((grid_rows = atoi(argv[1]))<=0|| (grid_cols = atoi(argv[1]))<=0|| (pyramid_height = atoi(argv[2]))<=0|| (total_iterations = atoi(argv[3]))<=0) usage(argc, argv); tfile=argv[4]; pfile=argv[5]; ofile=argv[6]; size=grid_rows*grid_cols; // --------------- pyramid parameters --------------- int borderCols = (pyramid_height)*EXPAND_RATE/2; int borderRows = (pyramid_height)*EXPAND_RATE/2; int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1); int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1); FilesavingTemp = (double *) malloc(size*sizeof(double)); FilesavingPower = (double *) malloc(size*sizeof(double)); // MatrixOut = (double *) calloc (size, sizeof(double)); if( !FilesavingPower || !FilesavingTemp) // || !MatrixOut) fatal("unable to allocate memory"); // Read input data from disk readinput(FilesavingTemp, grid_rows, grid_cols, tfile); readinput(FilesavingPower, grid_rows, grid_cols, pfile); // Load kernel source from file const char *source = load_kernel_source("hotspot_kernel.cl"); size_t sourceSize = strlen(source); // Compile the kernel cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Create an executable from the kernel error = clBuildProgram(program, 1, &device, NULL, NULL, NULL); // Show compiler warnings/errors static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); kernel = clCreateKernel(program, "hotspot", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long start_time = get_time(); // Create two temperature matrices and copy the temperature input data cl_mem MatrixTemp[2]; // Create input memory buffers on device MatrixTemp[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(double) * size, FilesavingTemp, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(double) * size, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Copy the power input data cl_mem MatrixPower = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double) * size, FilesavingPower, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Perform the computation int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows, total_iterations, pyramid_height, blockCols, blockRows, borderCols, borderRows, FilesavingTemp, FilesavingPower); // Copy final temperature data back cl_double *MatrixOut = (cl_double *) clEnqueueMapBuffer(command_queue, MatrixTemp[ret], CL_TRUE, CL_MAP_READ, 0, sizeof(double) * size, 0, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long end_time = get_time(); printf("Total time: %.3f seconds\n", ((double) (end_time - start_time)) / (1000*1000)); // Write final output to output file #ifdef DUMPOUT writeoutput(MatrixOut, grid_rows, grid_cols, ofile); #endif error = clEnqueueUnmapMemObject(command_queue, MatrixTemp[ret], (void *) MatrixOut, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); clReleaseMemObject(MatrixTemp[0]); clReleaseMemObject(MatrixTemp[1]); clReleaseMemObject(MatrixPower); return 0; }
//------------------------------------------------------------------------------ // returns context associated with single device only, // to make it support multiple devices, a list of // <device type, device num> pairs is required cl_context create_cl_context(const std::string& platformName, const std::string& deviceTypeName, int deviceNum) { cl_int status = 0; //1) get platfors and search for platform matching platformName cl_uint numPlatforms = 0; status = clGetPlatformIDs(0, 0, &numPlatforms); check_cl_error(status, "clGetPlatformIDs"); if(numPlatforms < 1) { std::cout << "No OpenCL platforms found" << std::endl; exit(EXIT_SUCCESS); } typedef std::vector< cl_platform_id > PlatformIDs; PlatformIDs platformIDs(numPlatforms); status = clGetPlatformIDs(numPlatforms, &platformIDs[0], 0); check_cl_error(status, "clGetPlatformIDs"); std::vector< char > buf(0x10000, char(0)); cl_platform_id platformID; PlatformIDs::const_iterator pi = platformIDs.begin(); for(; pi != platformIDs.end(); ++pi) { status = clGetPlatformInfo(*pi, CL_PLATFORM_NAME, buf.size(), &buf[0], 0); check_cl_error(status, "clGetPlatformInfo"); if(platformName == &buf[0]) { platformID = *pi; break; } } if(pi == platformIDs.end()) { std::cerr << "ERROR - Couldn't find platform " << platformName << std::endl; exit(EXIT_FAILURE); } //2) get devices of deviceTypeName type and store their ids into // an array then select device id at position deviceNum cl_device_type deviceType; if(deviceTypeName == "default") deviceType = CL_DEVICE_TYPE_DEFAULT; else if(deviceTypeName == "cpu") deviceType = CL_DEVICE_TYPE_CPU; else if(deviceTypeName == "gpu") deviceType = CL_DEVICE_TYPE_GPU; else if(deviceTypeName == "acc") deviceType = CL_DEVICE_TYPE_ACCELERATOR; else if(deviceTypeName == "all") deviceType = CL_DEVICE_TYPE_CPU; else { std::cerr << "ERROR - device type " << deviceTypeName << " unknown" << std::endl; exit(EXIT_FAILURE); } cl_uint numDevices = 0; status = clGetDeviceIDs(platformID, deviceType, 0, 0, &numDevices); check_cl_error(status, "clGetDeviceIDs"); if(numDevices < 1) { std::cerr << "ERROR - Cannot find device of type " << deviceTypeName << std::endl; exit(EXIT_FAILURE); } typedef std::vector< cl_device_id > DeviceIDs; DeviceIDs deviceIDs(numDevices); status = clGetDeviceIDs(platformID, deviceType, numDevices, &deviceIDs[0], 0); check_cl_error(status, "clGetDeviceIDs"); if(deviceNum < 0 || deviceNum >= numDevices) { std::cerr << "ERROR - device number out of range: [0," << (numDevices - 1) << ']' << std::endl; exit(EXIT_FAILURE); } cl_device_id deviceID = deviceIDs[deviceNum]; //3) create and return context cl_context_properties ctxProps[] = { CL_CONTEXT_PLATFORM, cl_context_properties(platformID), 0 }; //only a single device supported cl_context ctx = clCreateContext(ctxProps, 1, &deviceID, &context_callback, 0, &status); check_cl_error(status, "clCreateContext"); return ctx; }
int ocl_t::init() { std::cout << "Query available compute devices ...\n"; cl_int err; cl_uint num; err = clGetPlatformIDs(0, 0, &num); if (err != CL_SUCCESS) { std::cerr << "Unable to get platforms\n"; return 0; } std::vector<cl_platform_id> platforms(num); err = clGetPlatformIDs(num, &platforms[0], &num); if (err != CL_SUCCESS) { std::cerr << "Unable to get platform ID\n"; return 0; } int device_counter = 0; for (size_t platform_id = 0; platform_id < num; platform_id++){ size_t dev_c, info_c; clGetPlatformInfo(platforms[platform_id], CL_PLATFORM_NAME, 0, NULL, &info_c); std::string platname; platname.resize(info_c); clGetPlatformInfo(platforms[platform_id], CL_PLATFORM_NAME, info_c, &platname[0], 0); std::cout << "Platform :" << platname << "\n"; cl_context_properties prop[] = { CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platforms[platform_id]), 0 }; context = clCreateContextFromType(prop, CL_DEVICE_TYPE_ALL, NULL, NULL, NULL); if (context == 0) { std::cerr << "Can't create OpenCL context\n"; return 0; } clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &dev_c); std::vector<cl_device_id> devices(dev_c / sizeof(cl_device_id)); clGetContextInfo(context, CL_CONTEXT_DEVICES, dev_c, &devices[0], 0); for (auto i = devices.begin(); i != devices.end(); i++){ clGetDeviceInfo(*i, CL_DEVICE_NAME, 0, NULL, &info_c); std::string devname; devname.resize(info_c); clGetDeviceInfo(*i, CL_DEVICE_NAME, info_c, &devname[0], 0); std::cout << "\tDevice " << device_counter++ << ": " << devname.c_str() << "\n"; pdpair_t pd; pd.device_id = i - devices.begin(); pd.platform_id = platform_id; ocl_device_list.push_back(pd); } clReleaseContext(context); } if (list_available_devices) return 0; cl_context_properties prop[] = { CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platforms[ocl_device_list[opencl_device_id].platform_id]), 0 }; context = clCreateContextFromType(prop, CL_DEVICE_TYPE_ALL, NULL, NULL, NULL); if (context == 0) { std::cerr << "Can't create OpenCL context\n"; return 0; } size_t dev_c, info_c; clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &dev_c); std::vector<cl_device_id> devices(dev_c / sizeof(cl_device_id)); clGetContextInfo(context, CL_CONTEXT_DEVICES, dev_c, &devices[0], 0); device_used = devices[ocl_device_list[opencl_device_id].device_id]; clGetDeviceInfo(device_used, CL_DEVICE_NAME, 0, NULL, &info_c); std::string devname; devname.resize(info_c); clGetDeviceInfo(device_used, CL_DEVICE_NAME, info_c, &devname[0], 0); std::cout << "Execute on Device " << opencl_device_id << ": " << devname << std::endl; std::cout << "OK!\n"; queue = clCreateCommandQueue(context, device_used, 0, 0); if (queue == 0) { std::cerr << "Can't create command queue\n"; return 0; } cl_res = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float)* iterations, NULL, NULL); if (cl_res == 0) { std::cerr << "Can't create OpenCL buffer\n"; return 0; } FILE* f = fopen("kernel.c", "rb"); fseek(f, 0, SEEK_END); size_t tell = ftell(f); rewind(f); ocl_src_char = (char*)calloc(tell + 1, 1); fread(ocl_src_char, tell, 1, f); initialized = 1; return 0; }
int main(int argc, char *argv[]) { // selected platform and device number cl_uint pn = 0, dn = 0; // OpenCL error cl_int error; // generic iterator cl_uint i; // major/minor version of the platform OpenCL version cl_uint ocl_major, ocl_minor; // set platform/device num from command line if (argc > 1) pn = atoi(argv[1]); if (argc > 2) dn = atoi(argv[2]); error = clGetPlatformIDs(0, NULL, &np); CHECK_ERROR("getting amount of platform IDs"); printf("%u platforms found\n", np); if (pn >= np) { fprintf(stderr, "there is no platform #%u\n" , pn); exit(1); } // only allocate for IDs up to the intended one platform = calloc(pn+1,sizeof(*platform)); // if allocation failed, next call will bomb. rely on this error = clGetPlatformIDs(pn+1, platform, NULL); CHECK_ERROR("getting platform IDs"); // choose platform p = platform[pn]; error = clGetPlatformInfo(p, CL_PLATFORM_NAME, BUFSZ, strbuf, NULL); CHECK_ERROR("getting platform name"); printf("using platform %u: %s\n", pn, strbuf); error = clGetPlatformInfo(p, CL_PLATFORM_VERSION, BUFSZ, strbuf, NULL); CHECK_ERROR("getting platform version"); // we need 1.2 at least i = sscanf(strbuf, "OpenCL %u.%u ", &ocl_major, &ocl_minor); if (i != 2) { fprintf(stderr, "%s:%u: unable to determine platform OpenCL version\n", __func__, __LINE__); exit(1); } if (ocl_major == 1 && ocl_minor < 2) { fprintf(stderr, "%s:%u: Platform version %s is not at least 1.2\n", __func__, __LINE__, strbuf); exit(1); } error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &nd); CHECK_ERROR("getting amount of device IDs"); printf("%u devices found\n", nd); if (dn >= nd) { fprintf(stderr, "there is no device #%u\n", dn); exit(1); } // only allocate for IDs up to the intended one device = calloc(dn+1,sizeof(*device)); // if allocation failed, next call will bomb. rely on this error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, dn+1, device, NULL); CHECK_ERROR("getting device IDs"); // choose device d = device[dn]; error = clGetDeviceInfo(d, CL_DEVICE_NAME, BUFSZ, strbuf, NULL); CHECK_ERROR("getting device name"); printf("using device %u: %s\n", dn, strbuf); error = clGetDeviceInfo(d, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(gmem), &gmem, NULL); CHECK_ERROR("getting device global memory size"); error = clGetDeviceInfo(d, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(alloc_max), &alloc_max, NULL); CHECK_ERROR("getting device max memory allocation size"); // create context ctx_prop[1] = (cl_context_properties)p; ctx = clCreateContext(ctx_prop, 1, &d, NULL, NULL, &error); CHECK_ERROR("creating context"); // create queue q = clCreateCommandQueue(ctx, d, CL_QUEUE_PROFILING_ENABLE, &error); CHECK_ERROR("creating queue"); // create program pg = clCreateProgramWithSource(ctx, sizeof(src)/sizeof(*src), src, NULL, &error); CHECK_ERROR("creating program"); // build program error = clBuildProgram(pg, 1, &d, NULL, NULL, NULL); CHECK_ERROR("building program"); // get kernel k = clCreateKernel(pg, "add", &error); CHECK_ERROR("creating kernel"); error = clGetKernelWorkGroupInfo(k, d, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(wgm), &wgm, NULL); CHECK_ERROR("getting preferred workgroup size multiple"); // number of elements on which kernel will be launched. it's ok if we don't // cover every byte of the buffers nels = alloc_max/sizeof(cl_float); gws = ROUND_MUL(nels, wgm); printf("will use %zu workitems grouped by %zu to process %u elements\n", gws, wgm, nels); // we will try and allocate at least one buffer more than needed to fill // the device memory, and no less than 3 anyway nbuf = gmem/alloc_max + 1; if (nbuf < 3) nbuf = 3; #define MB (1024*1024.0) printf("will try allocating %u host buffers of %gMB each to overcommit %gMB\n", nbuf, alloc_max/MB, gmem/MB); hostbuf = calloc(nbuf, sizeof(cl_mem)); if (!hostbuf) { fprintf(stderr, "could not prepare support for %u buffers\n", nbuf); exit(1); } // allocate ‘host’ buffers for (i = 0; i < nbuf; ++i) { hostbuf[i] = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, alloc_max, NULL, &error); CHECK_ERROR("allocating host buffer"); printf("host buffer %u allocated\n", i); error = clEnqueueMigrateMemObjects(q, 1, hostbuf + i, CL_MIGRATE_MEM_OBJECT_HOST | CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED, 0, NULL, NULL); CHECK_ERROR("migrating buffer to host"); printf("buffer %u migrated to host\n", i); } // allocate ‘device’ buffers for (i = 0; i < 2; ++i) { devbuf[i] = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, alloc_max, NULL, &error); CHECK_ERROR("allocating devbuffer"); printf("dev buffer %u allocated\n", i); if (i == 0) { float patt = 0; error = clEnqueueFillBuffer(q, devbuf[0], &patt, sizeof(patt), 0, nels*sizeof(patt), 0, NULL, &mem_evt); CHECK_ERROR("enqueueing memset"); } } error = clWaitForEvents(1, &mem_evt); CHECK_ERROR("waiting for buffer fill"); clReleaseEvent(mem_evt); mem_evt = NULL; // use the buffers for (i = 0; i < nbuf; ++i) { printf("testing buffer %u\n", i); // for each buffer, we do a setup on CPU and then use it as second // argument for the kernel hbuf = clEnqueueMapBuffer(q, hostbuf[i], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, alloc_max, 0, NULL, NULL, &error); CHECK_ERROR("mapping buffer"); for (e = 0; e < nels; ++e) hbuf[e] = i; error = clEnqueueUnmapMemObject(q, hostbuf[i], hbuf, 0, NULL, NULL); CHECK_ERROR("unmapping buffer"); hbuf = NULL; // copy ‘host’ to ‘device’ buffer clEnqueueCopyBuffer(q, hostbuf[i], devbuf[1], 0, 0, alloc_max, 0, NULL, NULL); // make sure all pending actions are completed error = clFinish(q); CHECK_ERROR("settling down"); clSetKernelArg(k, 0, sizeof(cl_mem), devbuf); clSetKernelArg(k, 1, sizeof(cl_mem), devbuf + 1); clSetKernelArg(k, 2, sizeof(nels), &nels); error = clEnqueueNDRangeKernel(q, k, 1, NULL, &gws, &wgm, 0, NULL, &krn_evt); CHECK_ERROR("enqueueing kernel"); error = clEnqueueCopyBuffer(q, devbuf[0], hostbuf[0], 0, 0, alloc_max, 1, &krn_evt, &mem_evt); CHECK_ERROR("copying data to host"); expected = i*(i+1)/2.0f; hbuf = clEnqueueMapBuffer(q, hostbuf[0], CL_TRUE, CL_MAP_READ, 0, alloc_max, 1, &mem_evt, NULL, &error); CHECK_ERROR("mapping buffer 0"); for (e = 0; e < nels; ++e) if (hbuf[e] != expected) { fprintf(stderr, "mismatch @ %u: %g instead of %g\n", e, hbuf[e], expected); exit(1); } error = clEnqueueUnmapMemObject(q, hostbuf[0], hbuf, 0, NULL, NULL); CHECK_ERROR("unmapping buffer 0"); hbuf = NULL; clReleaseEvent(krn_evt); clReleaseEvent(mem_evt); krn_evt = mem_evt = NULL; } for (i = 1; i <= 2; ++i) { clReleaseMemObject(devbuf[2 - i]); printf("dev buffer %u freed\n", nbuf - i); } for (i = 1; i <= nbuf; ++i) { clReleaseMemObject(hostbuf[nbuf - i]); printf("host buffer %u freed\n", nbuf - i); } return 0; }
int clDevicesNum(void) { cl_int status; char pbuff[256]; cl_uint numDevices; cl_uint numPlatforms; int most_devices = -1; cl_platform_id *platforms; cl_platform_id platform = NULL; unsigned int i, mdplatform = 0; status = clGetPlatformIDs(0, NULL, &numPlatforms); /* If this fails, assume no GPUs. */ if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clGetPlatformsIDs failed (no OpenCL SDK installed?)", status); return -1; } if (numPlatforms == 0) { applog(LOG_ERR, "clGetPlatformsIDs returned no platforms (no OpenCL SDK installed?)"); return -1; } platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status); return -1; } for (i = 0; i < numPlatforms; i++) { if (opt_platform_id >= 0 && (int)i != opt_platform_id) continue; status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status); return -1; } platform = platforms[i]; applog(LOG_INFO, "CL Platform %d vendor: %s", i, pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform %d name: %s", i, pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(pbuff), pbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform %d version: %s", i, pbuff); status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_INFO, "Error %d: Getting Device IDs (num)", status); continue; } applog(LOG_INFO, "Platform %d devices: %d", i, numDevices); if ((int)numDevices > most_devices) { most_devices = numDevices; mdplatform = i; } if (numDevices) { unsigned int j; cl_device_id *devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); for (j = 0; j < numDevices; j++) { clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); applog(LOG_INFO, "\t%i\t%s", j, pbuff); } free(devices); } } if (opt_platform_id < 0) opt_platform_id = mdplatform;; return most_devices; }