int main (){ cl_platform_id clPlatform; cl_device_id clDevice; cl_context clContext; cl_command_queue clQueue; cl_program clProgram; int isMic=0; cl_uint numDevices; cl_platform_id platform; clGetPlatformIDs(1, &platform, NULL); cl_int err; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); //Check for MIC if GPU is not found if (err != CL_SUCCESS) { err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &numDevices); isMic = 1; } if (err != CL_SUCCESS) { fprintf(stderr, "[ERROR in OpenCLDriver::HI_get_num_devices()] Failed to get device IDs for type \n"); } cl_device_id devices[numDevices]; clGetPlatformIDs(1, &clPlatform, NULL); if(isMic) clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ACCELERATOR, numDevices, devices, NULL); else clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); for(int i=0; i< numDevices; i++) { clDevice = devices[i]; FILE *fp; char *source_str; size_t source_size; char filename[] = "openarc_kernel.cl"; fp = fopen(filename, "r"); if (!fp) { fprintf(stderr, "[INFO: in OpenCL binary creation] Failed to read the kernel file %s, so skipping binary generation for OpenCL devices %d\n", filename, i); break; } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); cl_int err; clContext = clCreateContext( NULL, 1, &clDevice, NULL, NULL, &err); if(err != CL_SUCCESS) { fprintf(stderr, "[ERROR in OpenCL binary creation] failed to create OPENCL context with error %d (OPENCL GPU)\n", err); } clQueue = clCreateCommandQueue(clContext, clDevice, 0, &err); if(err != CL_SUCCESS) { fprintf(stderr, "[ERROR in OpenCL binary creation] failed to create OPENCL queue with error %d (OPENCL GPU)\n", err); } char cBuffer[1024]; char *cBufferN; clGetDeviceInfo(clDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL); cBufferN = deblank(cBuffer); std::string binaryName = std::string("openarc_kernel_") + cBufferN + std::string(".ptx"); clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&source_str, (const size_t *)&source_size, &err); if(err != CL_SUCCESS) { fprintf(stderr, "[ERROR in OpenCL binary creation] failed to create OPENCL program with error %d (OPENCL GPU)\n", err); } char *envVar; envVar = getenv("OPENARC_JITOPTION"); err = clBuildProgram(clProgram, 1, &clDevice, envVar, NULL, NULL); #if PRINT_LOG == 0 if(err != CL_SUCCESS) { printf("[ERROR in OpenCL binary creation] Error in clBuildProgram, Line %u in file %s : %d!!!\n\n", __LINE__, __FILE__, err); if (err == CL_BUILD_PROGRAM_FAILURE) { // Determine the size of the log size_t log_size; clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log printf("%s\n", log); } exit(1); } #else // Determine the size of the log size_t log_size; clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log printf("%s\n", log); #endif size_t size; err = clGetProgramInfo( clProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL ); if(err != CL_SUCCESS) { fprintf(stderr, "[ERROR in OpenCL binary creation] failed to get OPENCL program info error %d (OPENCL GPU)\n", err); } unsigned char * binary = new unsigned char [size]; //#ifdef NVIDIA_GPU //err = clGetProgramInfo( clProgram, CL_PROGRAM_BINARIES, size, &binary, NULL ); //#else err = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &binary, NULL); //#endif if(err != CL_SUCCESS) { fprintf(stderr, "[ERROR in OpenCL binary creation] failed to dump OPENCL program binary error %d (OPENCL GPU)\n", err); } FILE * fpbin = fopen(binaryName.c_str(), "wb" ); fwrite(binary, 1 , size, fpbin); fclose(fpbin); delete[] binary; } #ifdef NVIDIA_GPU //Generate ptx files for .cu, only if nvcc is found on the system if (system("which nvcc")==0){ CUresult err; int major, minor; CUdevice cuDevice; CUcontext cuContext; CUmodule cuModule; int numDevices; cudaGetDeviceCount(&numDevices); for(int i=0 ; i < numDevices; i++) { cuDeviceGet(&cuDevice, i); #if CUDA_VERSION >= 5000 cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice); cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice); #else cuDeviceComputeCapability(&major, &minor, cuDevice); #endif std::stringstream ss; ss << major; ss << minor; std::string version = ss.str(); std::string ptxName = std::string("openarc_kernel_") + version + std::string(".ptx"); std::string command = std::string("nvcc $OPENARC_JITOPTION -arch=sm_") + version + std::string(" openarc_kernel.cu -ptx -o ") + ptxName; system(command.c_str()); } } #endif }
int main(){ cl_platform_id *platforms; cl_uint platforms_n; clGetPlatformIDs(0, NULL, &platforms_n); platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*platforms_n); clGetPlatformIDs(platforms_n, platforms, &platforms_n); printf("There are %d platforms\n", platforms_n); int i = 0; char re[1024]; for(i=0; i<platforms_n; i++){ printf("Platform: %d\n", i); clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 1024, re, NULL); printf("CL_PLATFORM_VENDOR: %s\n", re); clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 1024, re, NULL); printf("CL_PLATFORM_NAME: %s\n", re); clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 1024, re, NULL); printf("CL_PLATFORM_VERSION: %s\n", re); clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 1024, re, NULL); printf("CL_PLATFORM_PROFILE: %s\n", re); clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 1024, re, NULL); printf("CL_PLATFORM_EXTENSIONS: %s\n", re); cl_device_id *devices; cl_uint devices_n; clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &devices_n); cl_uint uint; cl_ulong ulong; devices = (cl_device_id*)malloc(sizeof(cl_device_id)*devices_n); clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, devices_n, devices, &devices_n); for(int j = 0; j < devices_n; j++ ){ char string[1024]; printf("\tPlatform: %d, devices: %d\n", i, j); clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 1024, string, NULL); printf("\tCL_DEVICE_NAME: %s\n", string); clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 1024, string, NULL); printf("\tCL_DEVICE_VENDOR: %s\n", string); clGetDeviceInfo(devices[j], CL_DEVICE_EXTENSIONS, 1024, string, NULL); printf("\tCL_DEVICE_EXTENSIONS: %s\n", string); clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uint), &uint, NULL); printf("\tCL_DEVICE_MAX_COMPUTE_UNITS: %d\n", uint); clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(uint), &uint, NULL); printf("\tCL_DEVICE_MAX_CLOCK_FREQUENCY: %d\n", uint); clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(ulong), &ulong, NULL); printf("\tCL_Device_LOCAL_MEM_SIZE: %lu\n", ulong); clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(ulong), &ulong, NULL); printf("\tCL_DEVICE_GLOBAL_MEM_SIZE: %lu\n", ulong); } free(devices); } free(platforms); return EXIT_SUCCESS; }
OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_) : OpenCLDeviceBase(info, stats, background_) { background = background_; /* Initialize cl_mem variables. */ kgbuffer = NULL; sd = NULL; sd_DL_shadow = NULL; rng_coop = NULL; throughput_coop = NULL; L_transparent_coop = NULL; PathRadiance_coop = NULL; Ray_coop = NULL; PathState_coop = NULL; Intersection_coop = NULL; ray_state = NULL; AOAlpha_coop = NULL; AOBSDF_coop = NULL; AOLightRay_coop = NULL; BSDFEval_coop = NULL; ISLamp_coop = NULL; LightRay_coop = NULL; Intersection_coop_shadow = NULL; #ifdef WITH_CYCLES_DEBUG debugdata_coop = NULL; #endif work_array = NULL; /* Queue. */ Queue_data = NULL; Queue_index = NULL; use_queues_flag = NULL; per_sample_output_buffers = NULL; per_thread_output_buffer_size = 0; hostRayStateArray = NULL; PathIteration_times = PATH_ITER_INC_FACTOR; #ifdef __WORK_STEALING__ work_pool_wgs = NULL; max_work_groups = 0; #endif current_max_closure = -1; first_tile = true; /* Get device's maximum memory that can be allocated. */ ciErr = clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &total_allocatable_memory, NULL); assert(ciErr == CL_SUCCESS); if(platform_name == "AMD Accelerated Parallel Processing") { /* This value is tweak-able; AMD platform does not seem to * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE * is considered for further computation. */ total_allocatable_memory /= 2; } }
void WorkScheduler::initialize(bool use_opencl, int num_cpu_threads) { /* initialize highlighting */ if (!g_highlightInitialized) { if (g_highlightedNodesRead) MEM_freeN(g_highlightedNodesRead); if (g_highlightedNodes) MEM_freeN(g_highlightedNodes); g_highlightedNodesRead = NULL; g_highlightedNodes = NULL; COM_startReadHighlights(); g_highlightInitialized = true; } #if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE /* deinitialize if number of threads doesn't match */ if (g_cpudevices.size() != num_cpu_threads) { Device *device; while (g_cpudevices.size() > 0) { device = g_cpudevices.back(); g_cpudevices.pop_back(); device->deinitialize(); delete device; } g_cpuInitialized = false; } /* initialize CPU threads */ if (!g_cpuInitialized) { for (int index = 0; index < num_cpu_threads; index++) { CPUDevice *device = new CPUDevice(); device->initialize(); g_cpudevices.push_back(device); } g_cpuInitialized = true; } #ifdef COM_OPENCL_ENABLED /* deinitialize OpenCL GPU's */ if (use_opencl && !g_openclInitialized) { g_context = NULL; g_program = NULL; if (!OCL_init()) /* this will check for errors and skip if already initialized */ return; if (clCreateContextFromType) { cl_uint numberOfPlatforms = 0; cl_int error; error = clGetPlatformIDs(0, 0, &numberOfPlatforms); if (error == -1001) { } /* GPU not supported */ else if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } if (G.f & G_DEBUG) printf("%d number of platforms\n", numberOfPlatforms); cl_platform_id *platforms = (cl_platform_id *)MEM_mallocN(sizeof(cl_platform_id) * numberOfPlatforms, __func__); error = clGetPlatformIDs(numberOfPlatforms, platforms, 0); unsigned int indexPlatform; for (indexPlatform = 0; indexPlatform < numberOfPlatforms; indexPlatform++) { cl_platform_id platform = platforms[indexPlatform]; cl_uint numberOfDevices = 0; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 0, &numberOfDevices); if (numberOfDevices <= 0) continue; cl_device_id *cldevices = (cl_device_id *)MEM_mallocN(sizeof(cl_device_id) * numberOfDevices, __func__); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numberOfDevices, cldevices, 0); g_context = clCreateContext(NULL, numberOfDevices, cldevices, clContextError, NULL, &error); if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } const char *cl_str[2] = {datatoc_COM_OpenCLKernels_cl, NULL}; g_program = clCreateProgramWithSource(g_context, 1, cl_str, 0, &error); error = clBuildProgram(g_program, numberOfDevices, cldevices, 0, 0, 0); if (error != CL_SUCCESS) { cl_int error2; size_t ret_val_size = 0; printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } char *build_log = (char *)MEM_mallocN(sizeof(char) * ret_val_size + 1, __func__); error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } build_log[ret_val_size] = '\0'; printf("%s", build_log); MEM_freeN(build_log); } else { unsigned int indexDevices; for (indexDevices = 0; indexDevices < numberOfDevices; indexDevices++) { cl_device_id device = cldevices[indexDevices]; cl_int vendorID = 0; cl_int error2 = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, NULL); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error2, clewErrorString(error2)); } OpenCLDevice *clDevice = new OpenCLDevice(g_context, device, g_program, vendorID); clDevice->initialize(); g_gpudevices.push_back(clDevice); } } MEM_freeN(cldevices); } MEM_freeN(platforms); } g_openclInitialized = true; } #endif #endif }
void opencl_info() { cl_int err_code; cl_platform_id *platforms; cl_device_type device_type; cl_uint num_devices; cl_device_id *devices; // Get OpenCL platforms // - Get the number of available platforms cl_uint num_platforms; err_code = clGetPlatformIDs(0, NULL, &num_platforms); clu_CheckError(err_code, "clGetPlatformIDs() for num_platforms"); if (num_platforms == 0) { fprintf(stderr, "No OpenCL platform!\n"); exit(EXIT_FAILURE); } // - Get platform IDs platforms = (cl_platform_id *)malloc(num_platforms*sizeof(cl_platform_id)); err_code = clGetPlatformIDs(num_platforms, platforms, NULL); clu_CheckError(err_code, "clGetPlatformIDs()"); // Get platform informations printf("\nNumber of platforms: %u\n\n", num_platforms); char tmp_buf[1024]; for (cl_uint i = 0; i < num_platforms; i++) { printf("platform: %u\n", i); err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 1024, &tmp_buf, NULL); clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_NAME"); printf("- CL_PLATFORM_NAME : %s\n", tmp_buf); err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 1024, &tmp_buf, NULL); clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_VENDOR"); printf("- CL_PLATFORM_VENDOR : %s\n", tmp_buf); err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 1024, &tmp_buf, NULL); clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_PROFILE"); printf("- CL_PLATFORM_PROFILE : %s\n", tmp_buf); err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 1024, &tmp_buf, NULL); clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_VERSION"); printf("- CL_PLATFORM_VERSION : %s\n", tmp_buf); err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 1024, &tmp_buf, NULL); clu_CheckError(err_code,"clGetPlatformInfo() for CL_PLATFORM_EXTENSIONS"); printf("- CL_PLATFORM_EXTENSIONS: %s\n", tmp_buf); printf("\n"); // Get the number of devices err_code = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); clu_CheckError(err_code, "clGetDeviceIDs for num_devices"); if (num_devices == 0) { fprintf(stderr, "No OpenCL device in this platform!\n"); exit(EXIT_FAILURE); } printf("Number of devices: %u\n", num_devices); // Get the default device cl_device_id default_device; cl_uint num_defaults; err_code = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_DEFAULT, 1, &default_device, &num_defaults); clu_CheckError(err_code, "clGetDeviceIDs() for CL_DEVICE_TYPE_DEFAULT"); if (num_defaults != 1) { printf("- # of default devices: %u\n", num_defaults); } // Get device IDs devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); err_code = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); clu_CheckError(err_code, "clGetDeviceIDs()"); for (cl_uint k = 0; k < num_devices; k++) { printf("device: %u (", k); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL); if (device_type & CL_DEVICE_TYPE_CPU) printf("CL_DEVICE_TYPE_CPU"); if (device_type & CL_DEVICE_TYPE_GPU) printf("CL_DEVICE_TYPE_GPU"); if (device_type & CL_DEVICE_TYPE_ACCELERATOR) printf("CL_DEVICE_TYPE_ACCELERATOR"); if (device_type & CL_DEVICE_TYPE_DEFAULT) printf("CL_DEVICE_TYPE_DEFAULT"); printf(")"); if (default_device == devices[k]) printf(" default"); printf("\n"); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_NAME, 1024, tmp_buf, NULL); printf(" - CL_DEVICE_NAME : %s\n", tmp_buf); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_VENDOR, 1024, tmp_buf, NULL); printf(" - CL_DEVICE_VENDOR : %s\n", tmp_buf); err_code = clGetDeviceInfo(devices[k], CL_DRIVER_VERSION, 1024, tmp_buf, NULL); printf(" - CL_DRIVER_VERSION : %s\n", tmp_buf); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_PROFILE, 1024, tmp_buf, NULL); printf(" - CL_DEVICE_PROFILE : %s\n", tmp_buf); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_VERSION, 1024, tmp_buf, NULL); printf(" - CL_DEVICE_VERSION : %s\n", tmp_buf); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_EXTENSIONS, 1024, tmp_buf, NULL); //CL_DEVICE_MAX_COMPUTE_UNITS //CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS //CL_DEVICE_MAX_WORK_GROUP_SIZE //CL_DEVICE_MAX_WORK_ITEM_SIZES // cl_uint usize; err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(usize), &usize, NULL); printf(" - CL_DEVICE_MAX_COMPUTE_UNITS : %d\n", usize); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(usize), &usize, NULL); printf(" - CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS : %d\n", usize); size_t size; err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size), &size, NULL); printf(" - CL_DEVICE_MAX_WORK_GROUP_SIZE : %d\n",size); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size), &size, NULL); printf(" - CL_DEVICE_MAX_WORK_ITEM_SIZES : %d\n", size); printf("\n"); } free(devices); printf("\n"); } free(platforms); }
int OpenCLDevice::getMaxWorkItemDimensions() { cl_uint value; check_error(clGetDeviceInfo(my_id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, (sizeof(cl_uint)), &value, NULL)); return value; }
int main() { /* Host/device data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel init_kernel, stage_kernel, scale_kernel; cl_int err, i; size_t global_size, local_size; cl_ulong local_mem_size; /* Data and buffer */ int direction; unsigned int num_points, points_per_group, stage; float data[NUM_POINTS*2]; double error, check_input[NUM_POINTS][2], check_output[NUM_POINTS][2]; cl_mem data_buffer; /* Initialize data */ srand((unsigned int)time(0)); for(i=0; i<NUM_POINTS; i++) { data[2*i] = (float)rand(); data[2*i+1] = (float)rand(); check_input[i][0] = (float)data[2*i]; check_input[i][1] = (float)data[2*i+1]; } /* Create a device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build the program */ program = build_program(context, device, PROGRAM_FILE); /* Create kernels for the FFT */ init_kernel = clCreateKernel(program, INIT_FUNC, &err); if(err < 0) { printf("Couldn't create the initial kernel: %d", err); exit(1); }; stage_kernel = clCreateKernel(program, STAGE_FUNC, &err); if(err < 0) { printf("Couldn't create the stage kernel: %d", err); exit(1); }; scale_kernel = clCreateKernel(program, SCALE_FUNC, &err); if(err < 0) { printf("Couldn't create the scale kernel: %d", err); exit(1); }; /* Create buffer */ data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, 2*NUM_POINTS*sizeof(float), data, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; /* Determine maximum work-group size */ err = clGetKernelWorkGroupInfo(init_kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL); if(err < 0) { perror("Couldn't find the maximum work-group size"); exit(1); }; /* Determine local memory size */ err = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL); if(err < 0) { perror("Couldn't determine the local memory size"); exit(1); }; /* Initialize kernel arguments */ direction = DIRECTION; num_points = NUM_POINTS; points_per_group = (unsigned int)(local_mem_size/(2*sizeof(float))); if(points_per_group > num_points) points_per_group = num_points; /* Set kernel arguments */ err = clSetKernelArg(init_kernel, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(init_kernel, 1, (size_t)local_mem_size, NULL); err |= clSetKernelArg(init_kernel, 2, sizeof(points_per_group), &points_per_group); err |= clSetKernelArg(init_kernel, 3, sizeof(num_points), &num_points); err |= clSetKernelArg(init_kernel, 4, sizeof(direction), &direction); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue initial kernel */ global_size = (num_points/points_per_group)*local_size; err = clEnqueueNDRangeKernel(queue, init_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the initial kernel"); exit(1); } /* Enqueue further stages of the FFT */ if(num_points > points_per_group) { err = clSetKernelArg(stage_kernel, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(stage_kernel, 2, sizeof(points_per_group), &points_per_group); err |= clSetKernelArg(stage_kernel, 3, sizeof(direction), &direction); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; for(stage = 2; stage <= num_points/points_per_group; stage <<= 1) { clSetKernelArg(stage_kernel, 1, sizeof(stage), &stage); err = clEnqueueNDRangeKernel(queue, stage_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the stage kernel"); exit(1); } } } /* Scale values if performing the inverse FFT */ if(direction < 0) { err = clSetKernelArg(scale_kernel, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(scale_kernel, 1, sizeof(points_per_group), &points_per_group); err |= clSetKernelArg(scale_kernel, 2, sizeof(num_points), &num_points); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; err = clEnqueueNDRangeKernel(queue, scale_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the initial kernel"); exit(1); } } /* Read the results */ err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, 2*NUM_POINTS*sizeof(float), data, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } /* Compute accurate values */ if(direction > 0) fft(NUM_POINTS, check_input, check_output); else ifft(NUM_POINTS, check_output, check_input); /* Determine error */ error = 0.0; for(i=0; i<NUM_POINTS; i++) { error += fabs(check_output[i][0] - data[2*i])/fabs(check_output[i][0]); error += fabs(check_output[i][1] - data[2*i+1])/fabs(check_output[i][1]); } error = error/(NUM_POINTS*2); /* Display check results */ printf("%u-point ", num_points); if(direction > 0) printf("FFT "); else printf("IFFT "); printf("completed with %lf average relative error.\n", error); /* Deallocate resources */ clReleaseMemObject(data_buffer); clReleaseKernel(init_kernel); clReleaseKernel(stage_kernel); clReleaseKernel(scale_kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
bool initOpenCL(ComputeEnv *env) { int r = cllib_init(); if (r < 0) { return false; } cl_uint num_plt; cl_platform_id plts[16]; clGetPlatformIDs(16, plts, &num_plt); bool found = false; cl_int err; cl_platform_id platform; cl_context context; cl_device_id dev; cl_command_queue queue; cl_kernel ker_filter, ker_filter_in1_out32, ker_filter_in128_out1; cl_kernel ker_filter_in3_out32, ker_filter_in128_out3; cl_program program = 0; for (unsigned int i=0; i<num_plt; i++) { size_t sz; cl_uint num_dev; clGetPlatformInfo(plts[i], CL_PLATFORM_NAME, 0, nullptr, &sz); std::vector<char> name(sz); clGetPlatformInfo(plts[i], CL_PLATFORM_NAME, sz, &name[0], &sz); bool is_amd = strstr(&name[0], "AMD") != NULL; bool is_apple = strstr(&name[0], "Apple") != NULL; //bool is_intel = strstr(&name[0], "Intel") != NULL; //bool is_nvidia = strstr(&name[0], "NVIDIA") != NULL; if (!is_amd && !is_apple) { continue; } clGetDeviceIDs(plts[i], CL_DEVICE_TYPE_GPU, 0, nullptr, &num_dev); if (num_dev == 0) { continue; } std::vector<cl_device_id> devs(num_dev); clGetDeviceIDs(plts[i], CL_DEVICE_TYPE_GPU, num_dev, &devs[0], &num_dev); platform = plts[i]; dev = devs[0]; cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(plts[i]), 0}; cl_context ctxt = clCreateContext(props, 1, &devs[0], NULL, NULL, &err); if (err != CL_SUCCESS) { continue; } context = ctxt; found = true; break; } if (!found) { return false; } size_t dev_name_len; clGetDeviceInfo(dev, CL_DEVICE_NAME, 0, nullptr, &dev_name_len); std::vector<char> dev_name(dev_name_len+1); clGetDeviceInfo(dev, CL_DEVICE_NAME, dev_name_len, &dev_name[0], &dev_name_len); bool bin_avaiable = false; #if defined __linux || _WIN32 #ifdef __linux ssize_t path_len = 4; char *self_path = (char*)malloc(path_len+1); while (1) { ssize_t r = readlink("/proc/self/exe", self_path, path_len); if (r < path_len) { self_path[r] = '\0'; break; } path_len *= 2; self_path = (char*)realloc(self_path, path_len+1); } struct stat self_st; stat(self_path, &self_st); self_path = dirname(self_path); #else size_t path_len = 4; char *self_path = (char*)malloc(path_len+1); DWORD len; while (1) { len = GetModuleFileName(NULL, self_path, path_len); if (len > 0 && len != path_len) { break; } path_len *= 2; self_path = (char*)realloc(self_path, path_len+1); } WIN32_FIND_DATA self_st; HANDLE finder = FindFirstFile(self_path, &self_st); FindClose(finder); for (int si=len-1; si>=0; si--) { if (self_path[si] == '\\') { self_path[si] = '\0'; break; } } #endif std::string bin_path = std::string(self_path) + "/" + &dev_name[0] + ".bin"; FILE *binfp = fopen(bin_path.c_str(), "rb"); if (binfp) { #ifdef __linux struct stat bin_st; stat(bin_path.c_str(), &bin_st); bool old = false; if (bin_st.st_mtim.tv_sec < self_st.st_mtim.tv_sec) { old = true; } if (bin_st.st_mtim.tv_sec == self_st.st_mtim.tv_sec) { if (bin_st.st_mtim.tv_nsec < self_st.st_mtim.tv_nsec) { old = true; } } size_t bin_sz = bin_st.st_size; #else WIN32_FIND_DATA bin_st; HANDLE finder = FindFirstFile(bin_path.c_str(), &bin_st); FindClose(finder); bool old = false; uint64_t self_time = (((uint64_t)self_st.ftLastWriteTime.dwHighDateTime)<<32) | ((uint64_t)self_st.ftLastWriteTime.dwLowDateTime); uint64_t bin_time = (((uint64_t)bin_st.ftLastWriteTime.dwHighDateTime)<<32) | ((uint64_t)bin_st.ftLastWriteTime.dwLowDateTime); if (bin_time < self_time) { old = true; } size_t bin_sz = bin_st.nFileSizeLow; #endif if (!old) { unsigned char *bin = (unsigned char*)malloc(bin_sz); size_t rem = bin_sz; unsigned char *p = bin; while (rem) { size_t rsz = fread(p, 1, rem, binfp); if (rsz <= 0) { break; } rem -= rsz; p += rsz; } if (rem == 0) { cl_int err; program = clCreateProgramWithBinary(context, 1, &dev, &bin_sz, (const unsigned char**)&bin, NULL, &err); if (err == CL_SUCCESS) { bin_avaiable = true; } } free(bin); } fclose(binfp); } #endif if (! bin_avaiable) { const char *source[1] = {prog}; size_t src_len[1] = {sizeof(prog)-1}; program = clCreateProgramWithSource(context, 1, source, src_len, &err); if (err != CL_SUCCESS) { clReleaseContext(context); return false; } } #if defined __linux || defined _WIN32 free(self_path); #endif err = clBuildProgram(program, 1, &dev, "" , nullptr, nullptr); if (err != CL_SUCCESS) { size_t log_len; clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_len); std::vector<char> log(log_len+1); clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, log_len, &log[0], &log_len); log[log_len] = '\0'; puts(&log[0]); clReleaseProgram(program); clReleaseContext(context); return false; } #if defined __linux || _WIN32 if (!bin_avaiable) { size_t binsz; size_t ret_len; clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(binsz), &binsz, &ret_len); char *buffer = new char [binsz]; char *ptrs[1]; ptrs[0] = buffer; clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(ptrs), ptrs, &ret_len); FILE *fp = fopen(bin_path.c_str(), "wb"); size_t rem = binsz; char *p = buffer; while (rem) { size_t wsz = fwrite(p, 1, rem, fp); if (wsz <= 0) { fclose(fp); unlink(bin_path.c_str()); fp=NULL; break; } rem -= wsz; p += wsz; } if (fp) { fclose(fp); } delete [] buffer; } #endif ker_filter = clCreateKernel(program, "filter", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); return false; } ker_filter_in1_out32 = clCreateKernel(program, "filter_in1_out32", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); return false; } ker_filter_in3_out32 = clCreateKernel(program, "filter_in3_out32", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); clReleaseKernel(ker_filter_in1_out32); return false; } ker_filter_in128_out1 = clCreateKernel(program, "filter_in128_out1", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); clReleaseKernel(ker_filter_in1_out32); return false; } ker_filter_in128_out3 = clCreateKernel(program, "filter_in128_out3", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); clReleaseKernel(ker_filter_in1_out32); return false; } queue = clCreateCommandQueue(context, dev, 0, &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); clReleaseKernel(ker_filter_in1_out32); return false; } env->num_cl_dev = 1; env->cl_dev_list = new OpenCLDev[1]; env->cl_dev_list[0].platform = platform; env->cl_dev_list[0].context = context; env->cl_dev_list[0].devid = dev; env->cl_dev_list[0].queue = queue; env->cl_dev_list[0].program = program; env->cl_dev_list[0].ker_filter = ker_filter; env->cl_dev_list[0].ker_filter_in1_out32 = ker_filter_in1_out32; env->cl_dev_list[0].ker_filter_in128_out1 = ker_filter_in128_out1; env->cl_dev_list[0].ker_filter_in3_out32 = ker_filter_in3_out32; env->cl_dev_list[0].ker_filter_in128_out3 = ker_filter_in128_out3; env->cl_dev_list[0].name = &dev_name[0]; return true; }
int main(int argc, const char** argv) { size_t x = 512, y = 250000; //y has to be a multiple of ciDeviceCount! struct svm_node* px = (struct svm_node*)malloc((x+1)*sizeof(struct svm_node)); gen_data(px, x, 1, 3); struct svm_node* py = (struct svm_node*)malloc((x+1)*y*sizeof(struct svm_node)); for(size_t i = 0; i < y; ++i) { struct svm_node* tmp = py+i*(x+1); gen_data(tmp, x, 3,2); } dtype* result = (dtype*)malloc(y*sizeof(dtype)); int* pyLength = (int*)malloc(y*sizeof(int)); for(size_t i = 0; i < y; ++i) { for(size_t j = 0; py[i*(x+1)+j].index >= 0; ++j) pyLength[i] = py[i*(x+1)+j].index; ++pyLength[i]; } cl_int err = CL_SUCCESS; // cl_platform_id platform = NULL; // cl_uint ciDeviceCount = 0; // cl_device_id *device = NULL; // retrieve devices cl_platform_id platform; err = clGetPlatformIDs(1, &platform, NULL); cl_device_id device; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); size_t localDim = 256l; size_t globalDim = localDim*y; /* device = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) ); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, ciDeviceCount, device, NULL); if (err != CL_SUCCESS) { printf("Failed to get devices:\n%s\n", oclErrorString(err)); return -1; } */ //Create the context cl_context context1 = clCreateContext(0, 1, &device, NULL, NULL, &err); if(err != CL_SUCCESS) { printf("Context creation failed:\n%d\n", err); return -1; } // create a command queue for first device the context reported cl_command_queue queue = clCreateCommandQueue(context1, device, 0, 0); // load program from disk char *tmp = strdup(argv[0]); char* my_dir = dirname(tmp); // size_t program_length; char path[256]; snprintf(path, PATH_MAX - 1, "%s/vecops.cl", my_dir); cl_program vecops = load_kernel(path, context1); if(err != CL_SUCCESS) { printf("Program creation failed:\n%d\n", (err)); return -1; } err = clBuildProgram(vecops, 0, NULL, "-I.", NULL, NULL); if(err != CL_SUCCESS) { err = clGetProgramBuildInfo(vecops, device, CL_PROGRAM_BUILD_LOG, 8192, buffer, NULL); if(err != CL_SUCCESS) printf("Cannot get build info: %d\n", (err)); printf("Build log:\n%s\n", buffer); } // create kernel cl_kernel sparsedot_kernel; #if version == 1 sparsedot_kernel = clCreateKernel(vecops, "sparsedot1_kernel", &err); #endif #if version == 2 sparsedot_kernel = clCreateKernel(vecops, "sparsedot4_kernel", &err); #endif #if version == 3 sparsedot_kernel = clCreateKernel(vecops, "sparsedot3_kernel", &err); #endif if (err != CL_SUCCESS) { printf("Kernel creation failed:\n%d\n", (err)); return -1; } // allocate memory on the devices cl_mem px_d, py_d, result_d, pyLength_d; #if version == 1 px_d = clCreateBuffer(context1, CL_MEM_READ_ONLY, (x+1) * sizeof(struct svm_node), 0, &err); #endif #if version == 2 || version == 3 //unpack px int size = px[x-1].index+1; for(size_t i = 0; i < y; ++i) size = size > pyLength[i] ? size : pyLength[i]; dtype* px_u = (dtype*)calloc(size, sizeof(dtype)); unpack(px, px_u); printf("px size: %d\n", size); #endif #if version == 3 size_t height, width; clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &height, 0); clGetDeviceInfo(Device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &width, 0); size_t region[3]; region[2] = 1; region[0] = min(4, size); region[1] = (size+2-1) / 4; cl_image_format px_format; px_format.image_channel_order = CL_R; px_format.image_channel_data_type = CL_FLOAT; #endif #if version == 2 px_d = clCreateBuffer(context1, CL_MEM_READ_ONLY, size * sizeof(dtype), 0, &err); #endif #if version == 3 px_d = clCreateImage2D(context1, CL_MEM_READ_ONLY, &px_format, region[0], region[1], 0, 0, &err); #endif if(err != CL_SUCCESS) { printf("Failed to allocate px:\n%d\n", (err)); return -1; } py_d = clCreateBuffer(context1, CL_MEM_READ_ONLY, (x+1) * y * sizeof(struct svm_node), 0, &err); if(err != CL_SUCCESS) { printf("Failed to allocate px:\n%d\n", (err)); return -1; } result_d = clCreateBuffer(context1, CL_MEM_WRITE_ONLY, y * sizeof(dtype), 0, 0); pyLength_d = clCreateBuffer(context1, CL_MEM_READ_ONLY, y * sizeof(int), 0, 0); #if bench //start time measurement start_timer(0); #endif // copy host vectors to device err = CL_SUCCESS; err |= clEnqueueWriteBuffer(queue, py_d, CL_FALSE, 0, (x+1) * y * sizeof(struct svm_node), py, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, pyLength_d, CL_FALSE, 0, y * sizeof(int), pyLength, 0, NULL, NULL); #if version == 1 err |= clEnqueueWriteBuffer(queue, px_d, CL_FALSE, 0, (x+1) * sizeof(struct svm_node), px, 0, NULL, NULL); #endif #if version == 2 err |= clEnqueueWriteBuffer(queue, px_d, CL_FALSE, 0, size * sizeof(dtype), px_u, 0, NULL, NULL); #endif #if version == 3 size_t offset[] = {0,0,0}; err |= clEnqueueWriteImage(queue, px_d, CL_TRUE, offset, region, sizeof(dtype), 0, px_u, 0, 0, NULL); #endif clFinish(queue); if(err != CL_SUCCESS) { printf("Data transfer to GPU failed:\n%d\n", (err)); return -1; } #if bench stop_timer(0); start_timer(1); #endif // set kernel arguments clSetKernelArg(sparsedot_kernel, 0, sizeof(cl_mem), (void *) &px_d); clSetKernelArg(sparsedot_kernel, 1, sizeof(cl_mem), (void *) &py_d); clSetKernelArg(sparsedot_kernel, 2, sizeof(cl_mem), (void *) &result_d); clSetKernelArg(sparsedot_kernel, 3, sizeof(cl_mem), (void *) &pyLength_d); clSetKernelArg(sparsedot_kernel, 4, sizeof(cl_ulong), (void *) &x); clSetKernelArg(sparsedot_kernel, 5, sizeof(cl_ulong), (void *) &y); // clSetKernelArg(sparsedot_kernel, 6, sizeof(cl_float8)*localDim, 0); #if version == 3 clSetKernelArg(sparsedot_kernel, 7, sizeof(cl_long), (void *) ®ion[1]) ; clSetKernelArg(sparsedot_kernel, 8, sizeof(cl_long), (void *) ®ion[0]) ; #endif clFlush(queue); // start kernel err = clEnqueueNDRangeKernel(queue, sparsedot_kernel, 1, 0, &globalDim, &localDim, 0, NULL, 0); if(err != CL_SUCCESS) { printf("Kernel launch failed:\n%d\n", (err)); return -1; } clFinish(queue); #if bench stop_timer(1); start_timer(2); #endif cl_event result_gather; // Non-blocking copy of result from device to host err = clEnqueueReadBuffer(queue, result_d, CL_FALSE, 0, y * sizeof(dtype), result, 0, NULL, &result_gather); if(err != CL_SUCCESS) { printf("Reading result failed:\n%d\n", (err)); return -1; } // CPU sync with GPU clWaitForEvents(1, &result_gather); #if bench // stop GPU time measurement stop_timer(2); #endif //check result /* for(size_t i = 0; i < y; ++i) { printf("%f ", result[i]); } printf("\n"); */ #if bench start_timer(3); #endif bool correct = validate(px, py, result, x, y); #if bench stop_timer(3); printf("v%i; x: %lu, y: %lu\n", version, x, y); printf("CPU: %f, upcpy: %f DeviceCalc: %f, downcpy: %f\n", get_secs(3), get_secs(0), get_secs(1), get_secs(2)); #endif if(correct) printf("SUCCESS!\n"); //cleenup clReleaseKernel(sparsedot_kernel); clReleaseCommandQueue(queue); clReleaseEvent(result_gather); clReleaseMemObject(px_d); clReleaseMemObject(py_d); clReleaseMemObject(result_d); clReleaseMemObject(pyLength_d); // clReleaseDevice(device); free(px); #if version == 2 || version == 3 free(px_u); #endif free(py); free(result); return 0; }
//Maybe somebody could tell me how to use template when exporting a class from a dll. Probably not possible? cl_uint OclHost::getDeviceInfoInt(cl_device_info info) { cl_uint value = 0; clGetDeviceInfo(oclDevice, info, sizeof(value), &value, 0); return value; }
cl_ulong OclHost::getDeviceInfoLong(cl_device_info info) { cl_ulong value = 0; clGetDeviceInfo(oclDevice, info, sizeof(value), &value, 0); return value; }
OclHost::OclHost(int const device_type, int gpu_id, int const cpu_cores) : devType(device_type), maxGlobalMem(0), maxLocalMem(0) { // if (!isGPU()) { // gpu_id = 0; // } cl_int ciErrNum = CL_SUCCESS; Log.Verbose("Using device number %d", gpu_id); //#pragma omp critical // { if (contextUserCount == 0) { Log.Verbose("Creating ocl context."); // cl_uint ciDeviceCount = 0; cl_platform_id cpPlatform = NULL; cpPlatform = getPlatform(); //Get the devices //Get number of devices ciErrNum = clGetDeviceIDs(cpPlatform, devType, 0, NULL, &ciDeviceCount); checkClError("Couldn't get number of OpenCl devices. Error: ", ciErrNum); if (isGPU()) { //Getting device ids devices = (cl_device_id *) malloc( ciDeviceCount * sizeof(cl_device_id)); ciErrNum = clGetDeviceIDs(cpPlatform, devType, ciDeviceCount, devices, NULL); checkClError("Couldn't get OpenCl device ids. Error: ", ciErrNum); //Create context oclGpuContext = clCreateContext(0, ciDeviceCount, devices, NULL, NULL, &ciErrNum); checkClError("Couldn't create context. Error: ", ciErrNum); Log.Message("Context for GPU devices created."); Log.Message("%d GPU device(s) found: ", ciDeviceCount); for (int i = 0; i < ciDeviceCount; ++i) { char device_string[1024]; char driver_string[1024]; clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(driver_string), &driver_string, NULL); Log.Message("Device %d: %s (Driver: %s)", i, device_string, driver_string); } } else { if (ciDeviceCount > 1) { Log.Error("More than one CPU device found."); exit(-1); } cl_device_id device_id; ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); checkClError("Couldn't get CPU device id. Error: ", ciErrNum); Log.Message("%d CPU device found.", ciDeviceCount); char device_string[1024]; char driver_string[1024]; clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(driver_string), &driver_string, NULL); Log.Message("Device %d: %s (Driver: %s)", 0, device_string, driver_string); cl_device_partition_property props[3]; props[0] = CL_DEVICE_PARTITION_EQUALLY; // Equally props[1] = 1; // 4 compute units per sub-device props[2] = 0; devices = (cl_device_id *) malloc(256 * sizeof(cl_device_id)); ciErrNum = clCreateSubDevices(device_id, props, 256, devices, &ciDeviceCount); if (ciErrNum == -18) { ciDeviceCount = 1; devices[0] = device_id; } else { checkClError("Couldn't create sub-devices. Error: ", ciErrNum); } Log.Message("%d CPU cores available.", ciDeviceCount); //Create context oclGpuContext = clCreateContext(0, ciDeviceCount, devices, NULL, NULL, &ciErrNum); checkClError("Couldn't create context. Error: ", ciErrNum); } } contextUserCount += 1; //} if (!isGPU()) { gpu_id = gpu_id % ciDeviceCount; } oclDevice = devices[gpu_id]; //Create context //oclGpuContext = clCreateContext(0, 1, &oclDevice, NULL, NULL, &ciErrNum); //checkClError("Couldn't create context. Error: ", ciErrNum); // create command queue oclCommandQueue = clCreateCommandQueue(oclGpuContext, oclDevice, 0, &ciErrNum); checkClError("Couldn't create command queue for device: ", ciErrNum); }
int clDevicesNum(void) { cl_int status; char pbuff[256]; cl_uint numDevices; cl_uint numPlatforms; int most_devices = -1; cl_platform_id *platforms; cl_platform_id platform = NULL; unsigned int i, mdplatform = 0; status = clGetPlatformIDs(0, NULL, &numPlatforms); /* If this fails, assume no GPUs. */ if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clGetPlatformsIDs failed (no OpenCL SDK installed?)", status); return -1; } if (numPlatforms == 0) { applog(LOG_ERR, "clGetPlatformsIDs returned no platforms (no OpenCL SDK installed?)"); return -1; } platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status); return -1; } for (i = 0; i < numPlatforms; i++) { if (opt_platform_id >= 0 && (int)i != opt_platform_id) continue; status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status); return -1; } platform = platforms[i]; applog(LOG_INFO, "CL Platform %d vendor: %s", i, pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform %d name: %s", i, pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(pbuff), pbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform %d version: %s", i, pbuff); status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_INFO, "Error %d: Getting Device IDs (num)", status); continue; } applog(LOG_INFO, "Platform %d devices: %d", i, numDevices); if ((int)numDevices > most_devices) { most_devices = numDevices; mdplatform = i; } if (numDevices) { unsigned int j; cl_device_id *devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); for (j = 0; j < numDevices; j++) { clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); applog(LOG_INFO, "\t%i\t%s", j, pbuff); } free(devices); } } if (opt_platform_id < 0) opt_platform_id = mdplatform;; return most_devices; }
_clState *initCl(unsigned int gpu, char *name, size_t nameSize) { _clState *clState = calloc(1, sizeof(_clState)); bool patchbfi = false, prog_built = false; struct cgpu_info *cgpu = &gpus[gpu]; cl_platform_id platform = NULL; char pbuff[256], vbuff[255]; cl_platform_id* platforms; cl_uint preferred_vwidth; cl_device_id *devices; cl_uint numPlatforms; cl_uint numDevices; cl_int status; status = clGetPlatformIDs(0, NULL, &numPlatforms); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platforms. (clGetPlatformsIDs)", status); return NULL; } platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status); return NULL; } if (opt_platform_id >= (int)numPlatforms) { applog(LOG_ERR, "Specified platform that does not exist"); return NULL; } status = clGetPlatformInfo(platforms[opt_platform_id], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status); return NULL; } platform = platforms[opt_platform_id]; if (platform == NULL) { perror("NULL platform found!\n"); return NULL; } applog(LOG_INFO, "CL Platform vendor: %s", pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform name: %s", pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(vbuff), vbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform version: %s", vbuff); status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status); return NULL; } if (numDevices > 0 ) { devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); /* Now, get the device list data */ status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (list)", status); return NULL; } applog(LOG_INFO, "List of devices:"); unsigned int i; for (i = 0; i < numDevices; i++) { status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device Info", status); return NULL; } applog(LOG_INFO, "\t%i\t%s", i, pbuff); } if (gpu < numDevices) { status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device Info", status); return NULL; } applog(LOG_INFO, "Selected %i: %s", gpu, pbuff); strncpy(name, pbuff, nameSize); } else { applog(LOG_ERR, "Invalid GPU %i", gpu); return NULL; } } else return NULL; cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status); return NULL; } ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status); if (status != CL_SUCCESS) /* Try again without OOE enable */ clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status); return NULL; } /* Check for BFI INT support. Hopefully people don't mix devices with * and without it! */ char * extensions = malloc(1024); const char * camo = "cl_amd_media_ops"; char *find; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS", status); return NULL; } find = strstr(extensions, camo); if (find) clState->hasBitAlign = true; /* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */ char * devoclver = malloc(1024); const char * ocl10 = "OpenCL 1.0"; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_VERSION, 1024, (void *)devoclver, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_VERSION", status); return NULL; } find = strstr(devoclver, ocl10); if (!find) clState->hasOpenCL11plus = true; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&preferred_vwidth, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", status); return NULL; } applog(LOG_DEBUG, "Preferred vector width reported %d", preferred_vwidth); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE", status); return NULL; } applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status); return NULL; } applog(LOG_DEBUG, "Max mem alloc size is %u", cgpu->max_alloc); /* Create binary filename based on parameters passed to opencl * compiler to ensure we only load a binary that matches what would * have otherwise created. The filename is: * name + kernelname +/- g(offset) + v + vectors + w + work_size + l + sizeof(long) + .bin * For scrypt the filename is: * name + kernelname + g + lg + lookup_gap + tc + thread_concurrency + w + work_size + l + sizeof(long) + .bin */ char binaryfilename[255]; char filename[255]; char numbuf[16]; if (cgpu->kernel == KL_NONE) { if (opt_scrypt) { applog(LOG_INFO, "Selecting scrypt kernel"); clState->chosen_kernel = KL_SCRYPT; } else if (!strstr(name, "Tahiti") && /* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */ (strstr(vbuff, "844.4") || // Linux 64 bit ATI 2.6 SDK strstr(vbuff, "851.4") || // Windows 64 bit "" strstr(vbuff, "831.4") || strstr(vbuff, "898.1") || // 12.2 driver SDK strstr(vbuff, "923.1") || // 12.4 strstr(vbuff, "938.2") || // SDK 2.7 strstr(vbuff, "1113.2"))) {// SDK 2.8 applog(LOG_INFO, "Selecting diablo kernel"); clState->chosen_kernel = KL_DIABLO; /* Detect all 7970s, older ATI and NVIDIA and use poclbm */ } else if (strstr(name, "Tahiti") || !clState->hasBitAlign) { applog(LOG_INFO, "Selecting poclbm kernel"); clState->chosen_kernel = KL_POCLBM; /* Use phatk for the rest R5xxx R6xxx */ } else { applog(LOG_INFO, "Selecting phatk kernel"); clState->chosen_kernel = KL_PHATK; } cgpu->kernel = clState->chosen_kernel; } else { clState->chosen_kernel = cgpu->kernel; if (clState->chosen_kernel == KL_PHATK && (strstr(vbuff, "844.4") || strstr(vbuff, "851.4") || strstr(vbuff, "831.4") || strstr(vbuff, "898.1") || strstr(vbuff, "923.1") || strstr(vbuff, "938.2") || strstr(vbuff, "1113.2"))) { applog(LOG_WARNING, "WARNING: You have selected the phatk kernel."); applog(LOG_WARNING, "You are running SDK 2.6+ which performs poorly with this kernel."); applog(LOG_WARNING, "Downgrade your SDK and delete any .bin files before starting again."); applog(LOG_WARNING, "Or allow cgminer to automatically choose a more suitable kernel."); } } /* For some reason 2 vectors is still better even if the card says * otherwise, and many cards lie about their max so use 256 as max * unless explicitly set on the command line. Tahiti prefers 1 */ if (strstr(name, "Tahiti")) preferred_vwidth = 1; else if (preferred_vwidth > 2) preferred_vwidth = 2; switch (clState->chosen_kernel) { case KL_POCLBM: strcpy(filename, POCLBM_KERNNAME".cl"); strcpy(binaryfilename, POCLBM_KERNNAME); break; case KL_PHATK: strcpy(filename, PHATK_KERNNAME".cl"); strcpy(binaryfilename, PHATK_KERNNAME); break; case KL_DIAKGCN: strcpy(filename, DIAKGCN_KERNNAME".cl"); strcpy(binaryfilename, DIAKGCN_KERNNAME); break; case KL_SCRYPT: strcpy(filename, SCRYPT_KERNNAME".cl"); strcpy(binaryfilename, SCRYPT_KERNNAME); /* Scrypt only supports vector 1 */ cgpu->vwidth = 1; break; case KL_NONE: /* Shouldn't happen */ case KL_DIABLO: strcpy(filename, DIABLO_KERNNAME".cl"); strcpy(binaryfilename, DIABLO_KERNNAME); break; } if (cgpu->vwidth) clState->vwidth = cgpu->vwidth; else { clState->vwidth = preferred_vwidth; cgpu->vwidth = preferred_vwidth; } if (((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) && clState->vwidth == 1 && clState->hasOpenCL11plus) || opt_scrypt) clState->goffset = true; if (cgpu->work_size && cgpu->work_size <= clState->max_work_size) clState->wsize = cgpu->work_size; else if (opt_scrypt) clState->wsize = 256; else if (strstr(name, "Tahiti")) clState->wsize = 64; else clState->wsize = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->vwidth; cgpu->work_size = clState->wsize; #ifdef USE_SCRYPT if (opt_scrypt) { if (!cgpu->opt_lg) { applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu); cgpu->lookup_gap = 2; } else cgpu->lookup_gap = cgpu->opt_lg; if (!cgpu->opt_tc) { unsigned int sixtyfours; sixtyfours = cgpu->max_alloc / 131072 / 64 - 1; cgpu->thread_concurrency = sixtyfours * 64; if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) { cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders; if (cgpu->thread_concurrency > cgpu->shaders * 5) cgpu->thread_concurrency = cgpu->shaders * 5; } applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu, cgpu->thread_concurrency); } else cgpu->thread_concurrency = cgpu->opt_tc; } #endif FILE *binaryfile; size_t *binary_sizes; char **binaries; int pl; char *source = file_contents(filename, &pl); size_t sourceSize[] = {(size_t)pl}; cl_uint slot, cpnd; slot = cpnd = 0; if (!source) return NULL; binary_sizes = calloc(sizeof(size_t) * MAX_GPUDEVICES * 4, 1); if (unlikely(!binary_sizes)) { applog(LOG_ERR, "Unable to calloc binary_sizes"); return NULL; } binaries = calloc(sizeof(char *) * MAX_GPUDEVICES * 4, 1); if (unlikely(!binaries)) { applog(LOG_ERR, "Unable to calloc binaries"); return NULL; } strcat(binaryfilename, name); if (clState->goffset) strcat(binaryfilename, "g"); if (opt_scrypt) { #ifdef USE_SCRYPT sprintf(numbuf, "lg%utc%u", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency); strcat(binaryfilename, numbuf); #endif } else { sprintf(numbuf, "v%d", clState->vwidth); strcat(binaryfilename, numbuf); } sprintf(numbuf, "w%d", (int)clState->wsize); strcat(binaryfilename, numbuf); sprintf(numbuf, "l%d", (int)sizeof(long)); strcat(binaryfilename, numbuf); strcat(binaryfilename, ".bin"); binaryfile = fopen(binaryfilename, "rb"); if (!binaryfile) { applog(LOG_DEBUG, "No binary found, generating from source"); } else { struct stat binary_stat; if (unlikely(stat(binaryfilename, &binary_stat))) { applog(LOG_DEBUG, "Unable to stat binary, generating from source"); fclose(binaryfile); goto build; } if (!binary_stat.st_size) goto build; binary_sizes[slot] = binary_stat.st_size; binaries[slot] = (char *)calloc(binary_sizes[slot], 1); if (unlikely(!binaries[slot])) { applog(LOG_ERR, "Unable to calloc binaries"); fclose(binaryfile); return NULL; } if (fread(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot]) { applog(LOG_ERR, "Unable to fread binaries"); fclose(binaryfile); free(binaries[slot]); goto build; } clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status); fclose(binaryfile); free(binaries[slot]); goto build; } fclose(binaryfile); applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename); goto built; } ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// build: clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithSource)", status); return NULL; } /* create a cl program executable for all the devices specified */ char *CompilerOptions = calloc(1, 256); #ifdef USE_SCRYPT if (opt_scrypt) sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, (int)clState->wsize); else #endif { sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d", (int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth); } applog(LOG_DEBUG, "Setting worksize to %d", clState->wsize); if (clState->vwidth > 1) applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth); if (clState->hasBitAlign) { strcat(CompilerOptions, " -D BITALIGN"); applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN"); if (strstr(name, "Cedar") || strstr(name, "Redwood") || strstr(name, "Juniper") || strstr(name, "Cypress" ) || strstr(name, "Hemlock" ) || strstr(name, "Caicos" ) || strstr(name, "Turks" ) || strstr(name, "Barts" ) || strstr(name, "Cayman" ) || strstr(name, "Antilles" ) || strstr(name, "Wrestler" ) || strstr(name, "Zacate" ) || strstr(name, "WinterPark" )) patchbfi = true; } else applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN"); if (patchbfi) { strcat(CompilerOptions, " -D BFI_INT"); applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT"); } else applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch"); if (clState->goffset) strcat(CompilerOptions, " -D GOFFSET"); if (!clState->hasOpenCL11plus) strcat(CompilerOptions, " -D OCL1"); applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions); status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL); free(CompilerOptions); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_ERR, "%s", log); return NULL; } prog_built = true; #ifdef __APPLE__ /* OSX OpenCL breaks reading off binaries with >1 GPU so always build * from source. */ goto built; #endif status = clGetProgramInfo(clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)", status); return NULL; } status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)", status); return NULL; } /* The actual compiled binary ends up in a RANDOM slot! Grr, so we have * to iterate over all the binary slots and find where the real program * is. What the heck is this!? */ for (slot = 0; slot < cpnd; slot++) if (binary_sizes[slot]) break; /* copy over all of the generated binaries. */ applog(LOG_DEBUG, "Binary size for gpu %d found in binary slot %d: %d", gpu, slot, binary_sizes[slot]); if (!binary_sizes[slot]) { applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!"); return NULL; } binaries[slot] = calloc(sizeof(char) * binary_sizes[slot], 1); status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL ); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)", status); return NULL; } /* Patch the kernel if the hardware supports BFI_INT but it needs to * be hacked in */ if (patchbfi) { unsigned remaining = binary_sizes[slot]; char *w = binaries[slot]; unsigned int start, length; /* Find 2nd incidence of .text, and copy the program's * position and length at a fixed offset from that. Then go * back and find the 2nd incidence of \x7ELF (rewind by one * from ELF) and then patch the opcocdes */ if (!advance(&w, &remaining, ".text")) goto build; w++; remaining--; if (!advance(&w, &remaining, ".text")) { /* 32 bit builds only one ELF */ w--; remaining++; } memcpy(&start, w + 285, 4); memcpy(&length, w + 289, 4); w = binaries[slot]; remaining = binary_sizes[slot]; if (!advance(&w, &remaining, "ELF")) goto build; w++; remaining--; if (!advance(&w, &remaining, "ELF")) { /* 32 bit builds only one ELF */ w--; remaining++; } w--; remaining++; w += start; remaining -= start; applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching", w, remaining); patch_opcodes(w, length); status = clReleaseProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Releasing program. (clReleaseProgram)", status); return NULL; } clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status); return NULL; } /* Program needs to be rebuilt */ prog_built = false; } free(source); /* Save the binary to be loaded next time */ binaryfile = fopen(binaryfilename, "wb"); if (!binaryfile) { /* Not a fatal problem, just means we build it again next time */ applog(LOG_DEBUG, "Unable to create file %s", binaryfilename); } else { if (unlikely(fwrite(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot])) { applog(LOG_ERR, "Unable to fwrite to binaryfile"); return NULL; } fclose(binaryfile); } built: if (binaries[slot]) free(binaries[slot]); free(binaries); free(binary_sizes); applog(LOG_INFO, "Initialising kernel %s with%s bitalign, %d vectors and worksize %d", filename, clState->hasBitAlign ? "" : "out", clState->vwidth, clState->wsize); if (!prog_built) { /* create a cl program executable for all the devices specified */ status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_ERR, "%s", log); return NULL; } } /* get a kernel object handle for a kernel with the given name */ clState->kernel = clCreateKernel(clState->program, "search", &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Kernel from program. (clCreateKernel)", status); return NULL; } #ifdef USE_SCRYPT if (opt_scrypt) { size_t ipt = (1024 / cgpu->lookup_gap + (1024 % cgpu->lookup_gap > 0)); size_t bufsize = 128 * ipt * cgpu->thread_concurrency; /* Use the max alloc value which has been rounded to a power of * 2 greater >= required amount earlier */ if (bufsize > cgpu->max_alloc) { applog(LOG_WARNING, "Maximum buffer memory device %d supports says %u", gpu, cgpu->max_alloc); applog(LOG_WARNING, "Your scrypt settings come to %u", bufsize); } applog(LOG_DEBUG, "Creating scrypt buffer sized %u", bufsize); clState->padbufsize = bufsize; /* This buffer is weird and might work to some degree even if * the create buffer call has apparently failed, so check if we * get anything back before we call it a failure. */ clState->padbuffer8 = NULL; clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); if (status != CL_SUCCESS && !clState->padbuffer8) { applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease TC or increase LG", status); return NULL; } clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status); return NULL; } } #endif clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status); return NULL; } return clState; }
long OpenCLDevice::getLocalMemSize() { cl_ulong value; check_error(clGetDeviceInfo(my_id, CL_DEVICE_LOCAL_MEM_SIZE, (sizeof(cl_ulong)), &value, NULL)); return value; }
int QueryHWinfo(size_t *maxCmptUnits) { cl_ulong globalmemSize, localmemSize, maxConstBufSize; size_t maxWGroupSize; size_t maxWIdims; size_t maxWItemSize3D[3]; char device_str[100]; char local_plat_buf[100]; char local_dev_buf[100]; int i; // Get & Set OpenCL Platforms // get Platform numbers cl_int ret = clGetPlatformIDs(1, NULL, &numPlatforms); cl_errChk(ret,"Error 0>> clGetPlatformIDs"); printf(">> Get Platform num = %d\n\n", numPlatforms); // get memory to store platform IDs platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id)); // store IDs into memory ret = clGetPlatformIDs(numPlatforms, platforms, NULL); cl_errChk(ret,"Error 1>> clGetPlatformIDs"); // Get OpenCL Platforms & Devices Info. for (i = 0; i < numPlatforms; i++) { // Get Platform Info. ret = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(local_plat_buf), local_plat_buf, NULL); cl_errChk(ret,"Error >> clGetPlatformInfo"); // Vendor Info. printf(">> Platform #%d: Vendor => %s\n", i, local_plat_buf); // get Devices numbers ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); cl_errChk(ret,"Error >> clGetDeviceIDs"); // get memory to store device IDs Devices = (cl_device_id*)malloc(sizeof(cl_device_id)* numDevices); if (numDevices == 0) { printf("!! There is no device in platform #%d\n", i); exit(0); } else { ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, Devices, NULL); printf(">> %d Device(s) in platform #%d\n", numDevices, i); } // Get Devices info. int j = 0; // cl_device_svm_capabilities caps; for (j=0; j< numDevices; j++) { printf("\n>> [ Device: %d ]\n", j); /* // Get SVM support ret = clGetDeviceInfo(Devices[j], CL_DEVICE_SVM_CAPABILITIES, sizeof(caps), &caps, 0); cl_errChk(ret,"Error >> clGetDeviceInfo_dev_svm"); printf("\t>> SVM Capabilities:\n"); if (ret == CL_SUCCESS){ if (caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) printf("\t\t>> CL_DEVICE_SVM_COARSE_GRAIN_BUFFER\n"); if (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) printf("\t\t>> CL_DEVICE_SVM_FINE_GRAIN_BUFFER\n"); if (caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) printf("\t\t>> CL_DEVICE_SVM_FINE_GRAIN_SYSTEM\n"); if (caps & CL_DEVICE_SVM_ATOMICS) printf("\t\t>> CL_DEVICE_SVM_ATOMICS\n"); } */ // Get Vendor info. ret = clGetDeviceInfo(Devices[j], CL_DEVICE_VENDOR, sizeof(device_str), device_str, NULL); cl_errChk(ret,"Error >> clGetDeviceInfo_dev_vendor"); printf("\t>> Vendor: %s\n", device_str); // Get Name info. ret = clGetDeviceInfo(Devices[j], CL_DEVICE_NAME, sizeof(local_dev_buf), local_dev_buf, NULL); cl_errChk(ret,"Error >> clGetDeviceInfo_dev_name"); printf("\t>> Model: %s\n", local_dev_buf); // Get Max Work Group Size ret = clGetDeviceInfo(Devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxWGroupSize), &maxWGroupSize, NULL); cl_errChk(ret,"Error >> clGetDeviceInfo_maxWGroupSize"); printf("\t>> CL_DEVICE_MAX_WORK_GROUP_SIZE (WIs/WG): %d\n", (int)maxWGroupSize); // Get Max Compute Units Size ret = clGetDeviceInfo(Devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(*maxCmptUnits), maxCmptUnits, NULL); cl_errChk(ret,"Error >> clGetDeviceInfo_maxCmptUnits"); printf("\t>> CL_DEVICE_MAX_COMPUTE_UNITS : %d\n", (int)*maxCmptUnits); // Get Max WORK_ITEM_DIMENSIONS ret = clGetDeviceInfo(Devices[j], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(maxWIdims), &maxWIdims, NULL); cl_errChk(ret,"Error >> clGetDeviceInfo_maxWorkItemD"); printf("\t>> CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: %d\n", (int)maxWIdims); // Get Max WORK_ITEM_SIZES ret = clGetDeviceInfo(Devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(maxWItemSize3D), &maxWItemSize3D, NULL); cl_errChk(ret,"Error >> clGetDeviceInfo_maxWItemSize3D"); printf("\t>> CL_DEVICE_MAX_WORK_ITEM_SIZES: %d, %d, %d\n", (int)maxWItemSize3D[0], (int)maxWItemSize3D[1], (int)maxWItemSize3D[2]); // Get GLOBAL_MEM_SIZE ret = clGetDeviceInfo(Devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(globalmemSize), &globalmemSize, NULL); cl_errChk(ret,"Error >> clGetDeviceInfo_globalmemSize"); printf("\t>> CL_DEVICE_GLOBAL_MEM_SIZE(B): %.1f\n", (float)globalmemSize); // Get MAX_CONSTANT_BUFFER_SIZE ret = clGetDeviceInfo(Devices[j], CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(maxConstBufSize), &maxConstBufSize, NULL); cl_errChk(ret,"Error >> clGetDeviceInfo_maxConstBufSize"); printf("\t>> CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE(B): %.1f\n", (float)maxConstBufSize); // Get LOCAL_MEM_SIZE ret = clGetDeviceInfo(Devices[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(localmemSize), &localmemSize, NULL); cl_errChk(ret,"Error >> clGetDeviceInfo_localmemSize"); printf("\t>> CL_DEVICE_LOCAL_MEM_SIZE(B): %.1f\n", (float)localmemSize); // Get CL_DEVICE_MAX_CLOCK_FREQUENCY ret = clGetDeviceInfo(Devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY , sizeof(localmemSize), &localmemSize, NULL); cl_errChk(ret,"Error >> clGetDeviceInfo_MAX_CLOCK_FREQUENCY"); printf("\t>> CL_DEVICE_MAX_CLOCK_FREQUENCY (MHz): %lu\n", localmemSize); } } return 0; }
long OpenCLDevice::getMaxParamSize() { cl_ulong value; check_error(clGetDeviceInfo(my_id, CL_DEVICE_MAX_PARAMETER_SIZE, (sizeof(cl_ulong)), &value, NULL)); return value; }
vx_status vxTargetInit(vx_target_t *target) { vx_status status = VX_ERROR_NO_RESOURCES; cl_int err = 0; vx_context context = target->base.context; cl_uint p, d, k; char *vx_incs = getenv("VX_CL_INCLUDE_DIR"); char *cl_dirs = getenv("VX_CL_SOURCE_DIR"); char cl_args[1024]; snprintf(cl_args, sizeof(cl_args), "-D VX_CL_KERNEL -I %s -I %s %s %s", (vx_incs?vx_incs:"C:\\Users\\Eric\\Desktop\\VS_OpenVX2\\example_multinode_graph\\cl_code"), cl_dirs, //#if !defined(__APPLE__) // "-D CL_USE_LUMINANCE", //#else "", //#endif #if defined(VX_INCLUDE_DIR) "-I "VX_INCLUDE_DIR" " #else " " #endif ); if (cl_dirs == NULL) { #ifdef VX_CL_SOURCE_DIR const char *sdir = VX_CL_SOURCE_DIR; int len = strlen(sdir); cl_dirs = malloc(len); strncpy(cl_dirs, sdir, len); #else return status; #endif } strncpy(target->name, name, VX_MAX_TARGET_NAME); target->priority = VX_TARGET_PRIORITY_OPENCL; context->num_platforms = CL_MAX_PLATFORMS; err = clGetPlatformIDs(CL_MAX_PLATFORMS, context->platforms, NULL); if (err != CL_SUCCESS) goto exit; for (p = 0; p < context->num_platforms; p++) { err = clGetDeviceIDs(context->platforms[p], CL_DEVICE_TYPE_ALL, 0, NULL, &context->num_devices[p]); err = clGetDeviceIDs(context->platforms[p], CL_DEVICE_TYPE_ALL, context->num_devices[p] > CL_MAX_DEVICES ? CL_MAX_DEVICES : context->num_devices[p], context->devices[p], NULL); if (err == CL_SUCCESS) { cl_context_properties props[] = { (cl_context_properties)CL_CONTEXT_PLATFORM, (cl_context_properties)context->platforms[p], (cl_context_properties)0, }; for (d = 0; d < context->num_devices[p]; d++) { char deviceName[64]; cl_bool compiler = CL_FALSE; cl_bool available = CL_FALSE; cl_bool image_support = CL_FALSE; err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); CL_ERROR_MSG(err, "clGetDeviceInfo"); err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_COMPILER_AVAILABLE, sizeof(cl_bool), &compiler, NULL); CL_ERROR_MSG(err, "clGetDeviceInfo"); err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); CL_ERROR_MSG(err, "clGetDeviceInfo"); err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL); CL_ERROR_MSG(err, "clGetDeviceInfo"); VX_PRINT(VX_ZONE_INFO, "Device %s (compiler=%s) (available=%s) (images=%s)\n", deviceName, (compiler?"TRUE":"FALSE"), (available?"TRUE":"FALSE"), (image_support?"TRUE":"FALSE")); } context->global[p] = clCreateContext(props, context->num_devices[p], context->devices[p], vxcl_platform_notifier, target, &err); if (err != CL_SUCCESS) break; /* check for supported formats */ if (err == CL_SUCCESS) { cl_uint f,num_entries = 0u; cl_image_format *formats = NULL; cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR; cl_mem_object_type type = CL_MEM_OBJECT_IMAGE2D; err = clGetSupportedImageFormats(context->global[p], flags, type, 0, NULL, &num_entries); formats = (cl_image_format *)malloc(num_entries * sizeof(cl_image_format)); err = clGetSupportedImageFormats(context->global[p], flags, type, num_entries, formats, NULL); for (f = 0; f < num_entries; f++) { char order[256]; char datat[256]; #define CASE_STRINGERIZE2(value, string) case value: strcpy(string, #value); break switch(formats[f].image_channel_order) { CASE_STRINGERIZE2(CL_R, order); CASE_STRINGERIZE2(CL_A, order); CASE_STRINGERIZE2(CL_RG, order); CASE_STRINGERIZE2(CL_RA, order); CASE_STRINGERIZE2(CL_RGB, order); CASE_STRINGERIZE2(CL_RGBA, order); CASE_STRINGERIZE2(CL_BGRA, order); CASE_STRINGERIZE2(CL_ARGB, order); CASE_STRINGERIZE2(CL_INTENSITY, order); CASE_STRINGERIZE2(CL_LUMINANCE, order); CASE_STRINGERIZE2(CL_Rx, order); CASE_STRINGERIZE2(CL_RGx, order); CASE_STRINGERIZE2(CL_RGBx, order); #if defined(CL_VERSION_1_2) && defined(cl_khr_gl_depth_images) CASE_STRINGERIZE2(CL_DEPTH, order); CASE_STRINGERIZE2(CL_DEPTH_STENCIL, order); #if defined(__APPLE__) CASE_STRINGERIZE2(CL_1RGB_APPLE, order); CASE_STRINGERIZE2(CL_BGR1_APPLE, order); CASE_STRINGERIZE2(CL_SFIXED14_APPLE, order); CASE_STRINGERIZE2(CL_BIASED_HALF_APPLE, order); CASE_STRINGERIZE2(CL_YCbYCr_APPLE, order); CASE_STRINGERIZE2(CL_CbYCrY_APPLE, order); CASE_STRINGERIZE2(CL_ABGR_APPLE, order); #endif #endif default: sprintf(order, "%x", formats[f].image_channel_order); break; } switch(formats[f].image_channel_data_type) { CASE_STRINGERIZE2(CL_SNORM_INT8, datat); CASE_STRINGERIZE2(CL_SNORM_INT16, datat); CASE_STRINGERIZE2(CL_UNORM_INT8, datat); CASE_STRINGERIZE2(CL_UNORM_INT16, datat); CASE_STRINGERIZE2(CL_UNORM_SHORT_565, datat); CASE_STRINGERIZE2(CL_UNORM_SHORT_555, datat); CASE_STRINGERIZE2(CL_UNORM_INT_101010, datat); CASE_STRINGERIZE2(CL_SIGNED_INT8, datat); CASE_STRINGERIZE2(CL_SIGNED_INT16, datat); CASE_STRINGERIZE2(CL_SIGNED_INT32, datat); CASE_STRINGERIZE2(CL_UNSIGNED_INT8, datat); CASE_STRINGERIZE2(CL_UNSIGNED_INT16, datat); CASE_STRINGERIZE2(CL_UNSIGNED_INT32, datat); CASE_STRINGERIZE2(CL_HALF_FLOAT, datat); CASE_STRINGERIZE2(CL_FLOAT, datat); #if defined(CL_VERSION_2_0) CASE_STRINGERIZE2(CL_UNORM_INT24, datat); #endif default: sprintf(order, "%x", formats[f].image_channel_data_type); break; } VX_PRINT(VX_ZONE_INFO, "%s : %s\n", order, datat); } } /* create a queue for each device */ for (d = 0; d < context->num_devices[p]; d++) { context->queues[p][d] = clCreateCommandQueue(context->global[p], context->devices[p][d], CL_QUEUE_PROFILING_ENABLE, &err); if (err == CL_SUCCESS) { } } char abs_source_path[VX_CL_MAX_PATH]; /* for each kernel */ for (k = 0; k < num_cl_kernels; k++) { char *sources = NULL; size_t programSze = 0; /* load the source file */ VX_PRINT(VX_ZONE_INFO, "Joiner: %s\n", FILE_JOINER); VX_PRINT(VX_ZONE_INFO, "Path: %s\n", cl_dirs); VX_PRINT(VX_ZONE_INFO, "Kernel[%u] File: %s\n", k, cl_kernels[k]->sourcepath); VX_PRINT(VX_ZONE_INFO, "Kernel[%u] Name: %s\n", k, cl_kernels[k]->kernelname); VX_PRINT(VX_ZONE_INFO, "Kernel[%u] ID: %s\n", k, cl_kernels[k]->description.name); int cl_dirs_len = strlen(cl_dirs); int sourcepath_len = strlen(cl_kernels[k]->sourcepath); strncpy(abs_source_path, cl_dirs, cl_dirs_len); strncpy(&abs_source_path[cl_dirs_len], cl_kernels[k]->sourcepath, sourcepath_len); abs_source_path[cl_dirs_len+sourcepath_len] = '\0'; sources = clLoadSources(abs_source_path, &programSze); VX_PRINT(VX_ZONE_INFO, "clLoadSources programSze:%d\n", programSze); /* create a program with this source */ cl_kernels[k]->program[p] = clCreateProgramWithSource(context->global[p], 1, (const char **)&sources, &programSze, &err); if (err == CL_SUCCESS) { err = clBuildProgram((cl_program)cl_kernels[k]->program[p], 1, (const cl_device_id *)context->devices, (const char *)cl_args, NULL, NULL); if (err != CL_SUCCESS) { CL_BUILD_MSG(err, "Build Error"); if (err == CL_BUILD_PROGRAM_FAILURE) { char log[10][1024]; size_t logSize = 0; clGetProgramBuildInfo((cl_program)cl_kernels[k]->program[p], (cl_device_id)context->devices[p][0], CL_PROGRAM_BUILD_LOG, sizeof(log), log, &logSize); VX_PRINT(VX_ZONE_ERROR, "%s", log); } } else { cl_int k2 = 0; cl_build_status bstatus = 0; size_t bs = 0; err = clGetProgramBuildInfo(cl_kernels[k]->program[p], context->devices[p][0], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &bstatus, &bs); VX_PRINT(VX_ZONE_INFO, "Status = %d (%d)\n", bstatus, err); /* get the cl_kernels from the program */ cl_kernels[k]->num_kernels[p] = 1; err = clCreateKernelsInProgram(cl_kernels[k]->program[p], 1, &cl_kernels[k]->kernels[p], NULL); VX_PRINT(VX_ZONE_INFO, "Found %u cl_kernels in %s (%d)\n", cl_kernels[k]->num_kernels[p], cl_kernels[k]->sourcepath, err); for (k2 = 0; (err == CL_SUCCESS) && (k2 < (cl_int)cl_kernels[k]->num_kernels[p]); k2++) { char kName[VX_MAX_KERNEL_NAME]; size_t size = 0; err = clGetKernelInfo(cl_kernels[k]->kernels[p], CL_KERNEL_FUNCTION_NAME, 0, NULL, &size); err = clGetKernelInfo(cl_kernels[k]->kernels[p], CL_KERNEL_FUNCTION_NAME, size, kName, NULL); VX_PRINT(VX_ZONE_INFO, "Kernel %s\n", kName); if (strncmp(kName, cl_kernels[k]->kernelname, VX_MAX_KERNEL_NAME) == 0) { vx_kernel_f kfunc = cl_kernels[k]->description.function; VX_PRINT(VX_ZONE_INFO, "Linked Kernel %s on target %s\n", cl_kernels[k]->kernelname, target->name); target->num_kernels++; target->base.context->num_kernels++; status = vxInitializeKernel(target->base.context, &target->kernels[k], cl_kernels[k]->description.enumeration, (kfunc == NULL ? vxclCallOpenCLKernel : kfunc), cl_kernels[k]->description.name, cl_kernels[k]->description.parameters, cl_kernels[k]->description.numParams, cl_kernels[k]->description.input_validate, cl_kernels[k]->description.output_validate, cl_kernels[k]->description.initialize, cl_kernels[k]->description.deinitialize); if (vxIsKernelUnique(&target->kernels[k]) == vx_true_e) { target->base.context->num_unique_kernels++; } else { VX_PRINT(VX_ZONE_KERNEL, "Kernel %s is NOT unqiue\n", target->kernels[k].name); } } } } } else { CL_ERROR_MSG(err, "Program"); } free(sources); } } } exit: if (err == CL_SUCCESS) { status = VX_SUCCESS; } else { status = VX_ERROR_NO_RESOURCES; } return status; }
// OpenCL functions int InitialiseCLEnvironment(cl_platform_id **platform, cl_device_id ***device_id, cl_program *program, renderStruct *render) { // error flag cl_int err; char infostring[1024]; char deviceInfo[1024]; // need to ensure platform supports OpenGL OpenCL interop before querying devices // to avoid segfault when calling clGetGLContextInfoKHR int *platformSupportsInterop; //get kernel from file FILE* kernelFile = fopen(kernelFileName, "rb"); fseek(kernelFile, 0, SEEK_END); long fileLength = ftell(kernelFile); rewind(kernelFile); char *kernelSource = malloc(fileLength*sizeof(char)); long read = fread(kernelSource, sizeof(char), fileLength, kernelFile); if (fileLength != read) printf("Error reading kernel file, line %d\n", __LINE__); fclose(kernelFile); //get platform and device information cl_uint numPlatforms; err = clGetPlatformIDs(0, NULL, &numPlatforms); *platform = malloc(numPlatforms * sizeof(cl_platform_id)); *device_id = malloc(numPlatforms * sizeof(cl_device_id*)); platformSupportsInterop = malloc(numPlatforms * sizeof(*platformSupportsInterop)); err |= clGetPlatformIDs(numPlatforms, *platform, NULL); CheckOpenCLError(err, __LINE__); cl_uint *numDevices; numDevices = malloc(numPlatforms * sizeof(cl_uint)); for (cl_uint i = 0; i < numPlatforms; i++) { clGetPlatformInfo((*platform)[i], CL_PLATFORM_VENDOR, sizeof(infostring), infostring, NULL); printf("\n---OpenCL: Platform Vendor %d: %s\n", i, infostring); err = clGetDeviceIDs((*platform)[i], CL_DEVICE_TYPE_ALL, 0, NULL, &(numDevices[i])); CheckOpenCLError(err, __LINE__); (*device_id)[i] = malloc(numDevices[i] * sizeof(cl_device_id)); platformSupportsInterop[i] = 0; err = clGetDeviceIDs((*platform)[i], CL_DEVICE_TYPE_ALL, numDevices[i], (*device_id)[i], NULL); CheckOpenCLError(err, __LINE__); for (cl_uint j = 0; j < numDevices[i]; j++) { char deviceName[200]; clGetDeviceInfo((*device_id)[i][j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); printf("---OpenCL: Device found %d. %s\n", j, deviceName); clGetDeviceInfo((*device_id)[i][j], CL_DEVICE_EXTENSIONS, sizeof(deviceInfo), deviceInfo, NULL); if (strstr(deviceInfo, "cl_khr_gl_sharing") != NULL) { printf("---OpenCL: cl_khr_gl_sharing supported!\n"); platformSupportsInterop[i] = 1; } else { printf("---OpenCL: cl_khr_gl_sharing NOT supported!\n"); platformSupportsInterop[i] |= 0; } if (strstr(deviceInfo, "cl_khr_fp64") != NULL) { printf("---OpenCL: cl_khr_fp64 supported!\n"); } else { printf("---OpenCL: cl_khr_fp64 NOT supported!\n"); } } } printf("\n"); //////////////////////////////// // This part is different to how we usually do things. Need to get context and device from existing // OpenGL context. Loop through all platforms looking for the device: cl_device_id device = NULL; int deviceFound = 0; cl_uint checkPlatform = 0; #ifdef TRYINTEROP while (!deviceFound) { if (platformSupportsInterop[checkPlatform]) { printf("---OpenCL: Looking for OpenGL Context device on platform %d ... ", checkPlatform); clGetGLContextInfoKHR_fn pclGetGLContextInfoKHR; PTR_FUNC_PTR pclGetGLContextInfoKHR = clGetExtensionFunctionAddressForPlatform((*platform)[checkPlatform], "clGetGLContextInfoKHR"); cl_context_properties properties[] = { CL_GL_CONTEXT_KHR, (cl_context_properties) glfwGetGLXContext(render->window), CL_GLX_DISPLAY_KHR, (cl_context_properties) glfwGetX11Display(), CL_CONTEXT_PLATFORM, (cl_context_properties) (*platform)[checkPlatform], 0}; err = pclGetGLContextInfoKHR(properties, CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR, sizeof(cl_device_id), &device, NULL); if (err != CL_SUCCESS) { printf("Not Found.\n"); checkPlatform++; if (checkPlatform > numPlatforms-1) { printf("---OpenCL: Error! Could not find OpenGL sharing device.\n"); deviceFound = 1; render->glclInterop = 0; } } else { printf("Found!\n"); deviceFound = 1; render->glclInterop = 1; } } else { checkPlatform++; } } if (render->glclInterop) { // Check the device we've found supports double precision clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(deviceInfo), deviceInfo, NULL); if (strstr(deviceInfo, "cl_khr_fp64") == NULL) { printf("---OpenCL: Interop device doesn't support double precision! We cannot use it.\n"); } else { cl_context_properties properties[] = { CL_GL_CONTEXT_KHR, (cl_context_properties) glfwGetGLXContext(render->window), CL_GLX_DISPLAY_KHR, (cl_context_properties) glfwGetX11Display(), CL_CONTEXT_PLATFORM, (cl_context_properties) (*platform)[checkPlatform], 0}; render->contextCL = clCreateContext(properties, 1, &device, NULL, 0, &err); CheckOpenCLError(err, __LINE__); } } #endif // if render->glclInterop is 0, either we are not trying to use it, we couldn't find an interop // device, or we found an interop device but it doesn't support double precision. // In these cases, have the user choose a platform and device manually. if (!(render->glclInterop)) { printf("Choose a platform and device.\n"); checkPlatform = numPlatforms; while (checkPlatform >= numPlatforms) { printf("Platform: "); scanf("%u", &checkPlatform); if (checkPlatform >= numPlatforms) { printf("Invalid Platform choice.\n"); } } cl_uint chooseDevice = numDevices[checkPlatform]; while (chooseDevice >= numDevices[checkPlatform]) { printf("Device: "); scanf("%u", &chooseDevice); if (chooseDevice >= numDevices[checkPlatform]) { printf("Invalid Device choice.\n"); } else { // Check the device we've chosen supports double precision clGetDeviceInfo((*device_id)[checkPlatform][chooseDevice], CL_DEVICE_EXTENSIONS, sizeof(deviceInfo), deviceInfo, NULL); if (strstr(deviceInfo, "cl_khr_fp64") == NULL) { printf("---OpenCL: Interop device doesn't support double precision! We cannot use it.\n"); chooseDevice = numDevices[checkPlatform]; } } } // Create non-interop context render->contextCL = clCreateContext(NULL, 1, &((*device_id)[checkPlatform][chooseDevice]), NULL, NULL, &err); device = (*device_id)[checkPlatform][chooseDevice]; } //////////////////////////////// // device is now fixed. Query its max global memory allocation size and store it, used in // HighResolutionRender routine, to determine into how many tiles we need to split the // computation. clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(render->deviceMaxAlloc), &(render->deviceMaxAlloc), NULL); printf("---OpenCL: Selected device has CL_DEVICE_MAX_MEM_ALLOC_SIZE: %lfMB\n", render->deviceMaxAlloc/1024.0/1024.0); // create a command queue render->queue = clCreateCommandQueue(render->contextCL, device, 0, &err); CheckOpenCLError(err, __LINE__); //create the program with the source above // printf("Creating CL Program...\n"); *program = clCreateProgramWithSource(render->contextCL, 1, (const char**)&kernelSource, NULL, &err); if (err != CL_SUCCESS) { printf("Error in clCreateProgramWithSource: %d, line %d.\n", err, __LINE__); return EXIT_FAILURE; } //build program executable err = clBuildProgram(*program, 0, NULL, "-I. -I src/", NULL, NULL); if (err != CL_SUCCESS) { printf("Error in clBuildProgram: %d, line %d.\n", err, __LINE__); char buffer[5000]; clGetProgramBuildInfo(*program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); printf("%s\n", buffer); return EXIT_FAILURE; } // dump ptx size_t binSize; clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binSize, NULL); unsigned char *bin = malloc(binSize); clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &bin, NULL); FILE *fp = fopen("openclPTX.ptx", "wb"); fwrite(bin, sizeof(char), binSize, fp); fclose(fp); free(bin); free(numDevices); free(kernelSource); printf("\n"); return EXIT_SUCCESS; }
int PGR_radiosity::prepareCL() { cl_int ciErr = CL_SUCCESS; // Get Platform cl_platform_id *cpPlatforms; cl_uint cuiPlatformsCount; ciErr = clGetPlatformIDs(0, NULL, &cuiPlatformsCount); this->CheckOpenCLError(ciErr, "clGetPlatformIDs: cuiPlatformsNum=%i", cuiPlatformsCount); cpPlatforms = (cl_platform_id*) malloc(cuiPlatformsCount * sizeof (cl_platform_id)); ciErr = clGetPlatformIDs(cuiPlatformsCount, cpPlatforms, NULL); this->CheckOpenCLError(ciErr, "clGetPlatformIDs"); cl_platform_id platform = 0; const unsigned int TMP_BUFFER_SIZE = 1024; char sTmp[TMP_BUFFER_SIZE]; for (unsigned int f0 = 0; f0 < cuiPlatformsCount; f0++) { //bool shouldBrake = false; ciErr = clGetPlatformInfo(cpPlatforms[f0], CL_PLATFORM_PROFILE, TMP_BUFFER_SIZE, sTmp, NULL); this->CheckOpenCLError(ciErr, "clGetPlatformInfo: Id=%i: CL_PLATFORM_PROFILE=%s", f0, sTmp); ciErr = clGetPlatformInfo(cpPlatforms[f0], CL_PLATFORM_VERSION, TMP_BUFFER_SIZE, sTmp, NULL); this->CheckOpenCLError(ciErr, "clGetPlatformInfo: Id=%i: CL_PLATFORM_VERSION=%s", f0, sTmp); ciErr = clGetPlatformInfo(cpPlatforms[f0], CL_PLATFORM_NAME, TMP_BUFFER_SIZE, sTmp, NULL); this->CheckOpenCLError(ciErr, "clGetPlatformInfo: Id=%i: CL_PLATFORM_NAME=%s", f0, sTmp); ciErr = clGetPlatformInfo(cpPlatforms[f0], CL_PLATFORM_VENDOR, TMP_BUFFER_SIZE, sTmp, NULL); this->CheckOpenCLError(ciErr, "clGetPlatformInfo: Id=%i: CL_PLATFORM_VENDOR=%s", f0, sTmp); //prioritize AMD and CUDA platforms if ((strcmp(sTmp, "NVIDIA Corporation") == 0)) { platform = cpPlatforms[f0]; } // if ((strcmp(sTmp, "Advanced Micro Devices, Inc.") == 0)) // { // platform = cpPlatforms[f0]; // } //prioritize Intel /*if ((strcmp(sTmp, "Intel(R) Corporation") == 0)) { platform = cpPlatforms[f0]; }*/ ciErr = clGetPlatformInfo(cpPlatforms[f0], CL_PLATFORM_EXTENSIONS, TMP_BUFFER_SIZE, sTmp, NULL); this->CheckOpenCLError(ciErr, "clGetPlatformInfo: Id=%i: CL_PLATFORM_EXTENSIONS=%s", f0, sTmp); } if (platform == 0) { //no prioritized found if (cuiPlatformsCount > 0) { platform = cpPlatforms[0]; } else { cerr << "No device was found" << endl; return -1; } } // Get Devices cl_uint cuiDevicesCount; ciErr = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &cuiDevicesCount); CheckOpenCLError(ciErr, "clGetDeviceIDs: cuiDevicesCount=%i", cuiDevicesCount); cl_device_id *cdDevices = (cl_device_id*) malloc(cuiDevicesCount * sizeof (cl_device_id)); ciErr = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, cuiDevicesCount, cdDevices, NULL); CheckOpenCLError(ciErr, "clGetDeviceIDs"); unsigned int deviceIndex = 0; for (unsigned int f0 = 0; f0 < cuiDevicesCount; f0++) { cl_device_type cdtTmp; size_t iDim[3]; ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_TYPE, sizeof (cdtTmp), &cdtTmp, NULL); CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_TYPE=%s%s%s%s", f0, cdtTmp & CL_DEVICE_TYPE_CPU ? "CPU," : "", cdtTmp & CL_DEVICE_TYPE_GPU ? "GPU," : "", cdtTmp & CL_DEVICE_TYPE_ACCELERATOR ? "ACCELERATOR," : "", cdtTmp & CL_DEVICE_TYPE_DEFAULT ? "DEFAULT," : ""); if (cdtTmp & CL_DEVICE_TYPE_GPU) { //prioritize gpu if both cpu and gpu are available deviceIndex = f0; } cl_bool bTmp; ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_AVAILABLE, sizeof (bTmp), &bTmp, NULL); CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_AVAILABLE=%s", f0, bTmp ? "YES" : "NO"); ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_NAME, TMP_BUFFER_SIZE, sTmp, NULL); CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_NAME=%s", f0, sTmp); ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_VENDOR, TMP_BUFFER_SIZE, sTmp, NULL); CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_VENDOR=%s", f0, sTmp); ciErr = clGetDeviceInfo(cdDevices[f0], CL_DRIVER_VERSION, TMP_BUFFER_SIZE, sTmp, NULL); CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DRIVER_VERSION=%s", f0, sTmp); ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_PROFILE, TMP_BUFFER_SIZE, sTmp, NULL); CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_PROFILE=%s", f0, sTmp); ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_VERSION, TMP_BUFFER_SIZE, sTmp, NULL); CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_VERSION=%s", f0, sTmp); ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof (iDim), iDim, NULL); CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_MAX_WORK_ITEM_SIZES=%ix%ix%i", f0, iDim[0], iDim[1], iDim[2]); ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof (size_t), iDim, NULL); CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_MAX_WORK_GROUP_SIZE=%i", f0, iDim[0]); ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_EXTENSIONS, TMP_BUFFER_SIZE, sTmp, NULL); CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_EXTENSIONS=%s", f0, sTmp); } cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; /* Create context */ this->context = clCreateContext(cps, 1, &cdDevices[deviceIndex], NULL, NULL, &ciErr); CheckOpenCLError(ciErr, "clCreateContext"); /* Create a command queue */ this->queue = clCreateCommandQueue(this->context, cdDevices[deviceIndex], CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ciErr); CheckOpenCLError(ciErr, "clCreateCommandQueue"); /* Create and compile and openCL program */ char *cSourceCL = loadProgSource("kernels.cl"); this->program = clCreateProgramWithSource(this->context, 1, (const char **) &cSourceCL, NULL, &ciErr); CheckOpenCLError(ciErr, "clCreateProgramWithSource"); free(cSourceCL); ciErr = clBuildProgram(this->program, 0, NULL, NULL, NULL, NULL); CheckOpenCLError(ciErr, "clBuildProgram"); cl_int logStatus; char *buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(this->program, cdDevices[deviceIndex], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); CheckOpenCLError(logStatus, "clGetProgramBuildInfo."); buildLog = (char*) malloc(buildLogSize); if (buildLog == NULL) { printf("Failed to allocate host memory. (buildLog)"); return -1; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(this->program, cdDevices[deviceIndex], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); CheckOpenCLError(logStatus, "clGetProgramBuildInfo."); free(buildLog); size_t tempKernelWorkGroupSize; /* Create kernels */ this->radiosityKernel = clCreateKernel(program, "radiosity", &ciErr); CheckOpenCLError(ciErr, "clCreateKernel radiosity"); this->sortKernel = clCreateKernel(program, "sort", &ciErr); CheckOpenCLError(ciErr, "clCreateKernel sort"); this->maxWorkGroupSize = 64; this->workGroupSize = 64; ciErr = clGetKernelWorkGroupInfo(this->radiosityKernel, cdDevices[deviceIndex], CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &tempKernelWorkGroupSize, 0); CheckOpenCLError(ciErr, "clGetKernelInfo"); this->maxWorkGroupSize = MIN(tempKernelWorkGroupSize, this->maxWorkGroupSize); if (this->workGroupSize > this->maxWorkGroupSize) { cout << "Out of Resources!" << endl; cout << "Group Size specified: " << this->workGroupSize << endl; cout << "Max Group Size supported on the kernel: " << this->maxWorkGroupSize << endl; cout << "Falling back to " << this->maxWorkGroupSize << endl; this->workGroupSize = this->maxWorkGroupSize; } /* Allocate buffer of colors */ this->patchesColorsCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, this->model->getPatchesCount() * sizeof (cl_uchar3), 0, &ciErr); CheckOpenCLError(ciErr, "CreateBuffer patchesCL"); this->raw_patchesColors = new cl_uchar3[this->model->getPatchesCount()]; this->raw_patchesEnergies = new cl_float[this->model->getPatchesCount()]; this->raw_diffColors = new cl_uchar3[this->model->getPatchesCount()]; this->raw_intensities = new cl_float[this->model->getPatchesCount()]; this->model->getPatchesCL(this->raw_patchesColors, this->raw_patchesEnergies); ciErr = clEnqueueWriteBuffer(this->queue, this->patchesColorsCL, CL_TRUE, //blocking write 0, this->model->getPatchesCount() * sizeof (cl_uchar3), this->raw_patchesColors, 0, 0, 0); CheckOpenCLError(ciErr, "Copy patches colors"); /* Alocate buffer of energies */ this->patchesEnergiesCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, this->model->getPatchesCount() * sizeof (cl_float), 0, &ciErr); CheckOpenCLError(ciErr, "CreateBuffer patchesCL"); ciErr = clEnqueueWriteBuffer(this->queue, this->patchesEnergiesCL, CL_TRUE, //blocking write 0, this->model->getPatchesCount() * sizeof (cl_float), this->raw_patchesEnergies, 0, 0, 0); CheckOpenCLError(ciErr, "Copy patches"); /* Allocate buffer of patches geometry */ this->patchesGeoCL = clCreateBuffer(this->context, CL_MEM_READ_ONLY, this->model->getPatchesCount() * sizeof (cl_float8), 0, &ciErr); CheckOpenCLError(ciErr, "CreateBuffer patchesGeometryCL"); this->raw_patchesGeo = new cl_float8[this->model->getPatchesCount()]; this->model->getPatchesGeometryCL(raw_patchesGeo); ciErr = clEnqueueWriteBuffer(this->queue, this->patchesGeoCL, CL_TRUE, //blocking write 0, this->model->getPatchesCount() * sizeof (cl_float8), this->raw_patchesGeo, 0, 0, 0); CheckOpenCLError(ciErr, "Copy patches geometry"); this->indicesCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, this->maxWorkGroupSize * sizeof (cl_uint), 0, &ciErr); CheckOpenCLError(ciErr, "CreateBuffer indicesCL"); this->indicesCountCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, sizeof (cl_uint), 0, &ciErr); CheckOpenCLError(ciErr, "CreateBuffer indicesCountCL"); this->maximalEnergyCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, sizeof (cl_float), 0, &ciErr); CheckOpenCLError(ciErr, "CreateBuffer maximalEnergyCL"); this->diffColorsCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, this->model->getPatchesCount() * sizeof (cl_uchar3), 0, &ciErr); CheckOpenCLError(ciErr, "CreateBuffer diffColorsCL"); cl_uchar3* zeros = new cl_uchar3[this->model->getPatchesCount()]; memset(zeros, 0, this->model->getPatchesCount() * sizeof (cl_uchar3)); ciErr = clEnqueueWriteBuffer(this->queue, this->diffColorsCL, CL_TRUE, //blocking write 0, this->model->getPatchesCount() * sizeof (cl_uchar3), zeros, 0, 0, 0); CheckOpenCLError(ciErr, "Clear diff colors"); delete [] zeros; this->intensitiesCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, this->model->getPatchesCount() * sizeof (cl_float), 0, &ciErr); CheckOpenCLError(ciErr, "CreateBuffer intensitiesCL"); cl_float* zeroIntensity = new cl_float[this->model->getPatchesCount()]; memset(zeroIntensity, 0, this->model->getPatchesCount() * sizeof (cl_float)); ciErr = clEnqueueWriteBuffer(this->queue, this->intensitiesCL, CL_TRUE, //blocking write 0, this->model->getPatchesCount() * sizeof (cl_float), zeroIntensity, 0, 0, 0); CheckOpenCLError(ciErr, "Clear intensities"); delete [] zeroIntensity; this->texturesCL = clCreateBuffer(this->context, CL_MEM_READ_ONLY, this->maxWorkGroupSize * 768 * 256 * sizeof (cl_uchar3), 0, &ciErr); CheckOpenCLError(ciErr, "CreateBuffer texturesCL"); this->visitedCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, this->maxWorkGroupSize * this->model->getPatchesCount() * sizeof (cl_bool), 0, &ciErr); CheckOpenCLError(ciErr, "CreateBuffer visitedCL"); cl_bool* zeroVisited = new cl_bool[this->maxWorkGroupSize * this->model->getPatchesCount()]; memset(zeroVisited, 0, this->maxWorkGroupSize * this->model->getPatchesCount() * sizeof (cl_bool)); ciErr = clEnqueueWriteBuffer(this->queue, this->visitedCL, CL_TRUE, //blocking write 0, this->model->getPatchesCount() * sizeof (cl_bool), zeroVisited, 0, 0, 0); CheckOpenCLError(ciErr, "Clear visited flags"); delete [] zeroVisited; free(cdDevices); return 0; }
int main(int argc, char **argv) { cl_uint num; cl_int err; int platform_idx = -1; cl_platform_id *plat_ids; int i; size_t sz; cl_device_id *gpu_devs; cl_context_properties cps[3]; cl_context context; int opt; char *input; int run_size = 1024; struct AIISA_Program prog; cl_command_queue queue; int ei; int nloop = 16; struct AIISA_CodeBuffer buf; aiisa_code_buffer_init(&buf); clGetPlatformIDs(0, NULL, &num); plat_ids = (cl_platform_id*)malloc(sizeof(*plat_ids) * num); clGetPlatformIDs(num, plat_ids, NULL); while ((opt = getopt(argc, argv, "n:")) != -1) { switch (opt) { case 'n': run_size = atoi(optarg); break; default: puts("usage : run in.cl"); return 1; } } if (optind >= argc) { puts("usage : run in.cl"); return 1; } input = argv[optind]; for (i=0; i<(int)num; i++) { char name[1024]; size_t len; clGetPlatformInfo(plat_ids[i], CL_PLATFORM_VENDOR, sizeof(name), name, &len); //puts(name); if (strcmp(name, "Advanced Micro Devices, Inc.") == 0) { platform_idx = i; break; } } if (platform_idx == -1) { puts("no amd"); return -1; } clGetDeviceIDs(plat_ids[platform_idx], CL_DEVICE_TYPE_GPU, 0, NULL, &num); if (num == 0) { puts("no gpu"); return -1; } gpu_devs = (cl_device_id*)malloc(sizeof(gpu_devs[0]) * 1); //clGetDeviceIDs(plat_ids[platform_idx], CL_DEVICE_TYPE_GPU, num, gpu_devs, NULL); cps[0] = CL_CONTEXT_PLATFORM; cps[1] = (cl_context_properties)plat_ids[platform_idx]; cps[2] = 0; context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &err); clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(gpu_devs), gpu_devs, &sz); queue = clCreateCommandQueue(context, gpu_devs[0], 0, NULL); { char name[1024]; size_t sz; clGetDeviceInfo(gpu_devs[0], CL_DEVICE_NAME, sizeof(name), name, &sz); puts(name); } //puts(input); aiisa_build_binary_from_cl(&prog, context, gpu_devs[0], input); for (ei=0; ei<nloop; ei++) { cl_program cl_prog; const unsigned char *bin[1]; size_t bin_size[1]; cl_kernel ker; cl_mem in, out; size_t global_size[3]; double tb, te; tb = sec(); gen_code(&prog, &buf); bin[0] = prog.cl_binary; bin_size[0] = prog.size; cl_prog = clCreateProgramWithBinary(context, 1, gpu_devs, bin_size, bin, NULL, NULL); clBuildProgram(cl_prog, 1, gpu_devs, NULL, NULL, NULL); ker = clCreateKernel(cl_prog, "f", &err); te = sec(); printf("build : %f[usec]\n", (te-tb)*1000000); in = clCreateBuffer(context, CL_MEM_READ_WRITE, run_size * sizeof(int), NULL, &err); out = clCreateBuffer(context, CL_MEM_READ_WRITE, run_size * sizeof(int), NULL, &err); clSetKernelArg(ker, 0, sizeof(cl_mem), &in); clSetKernelArg(ker, 1, sizeof(cl_mem), &out); { int *ptr = (int*)clEnqueueMapBuffer(queue, in, CL_TRUE, CL_MAP_WRITE, 0, run_size*sizeof(int), 0, NULL, NULL, NULL); int i; for (i=0; i<run_size; i++) { ptr[i] = i; } clEnqueueUnmapMemObject(queue, in, ptr, 0, NULL, NULL); } { int *ptr = (int*)clEnqueueMapBuffer(queue, out, CL_TRUE, CL_MAP_WRITE, 0, run_size*sizeof(int), 0, NULL, NULL, NULL); int i; for (i=0; i<run_size; i++) { ptr[i] = 0xdeadbeef; } clEnqueueUnmapMemObject(queue, out, ptr, 0, NULL, NULL); } err = clFinish(queue); global_size[0] = run_size; err = clEnqueueNDRangeKernel(queue, ker, 1, NULL, global_size, NULL, 0, NULL, NULL); if (err != CL_SUCCESS) { puts("enqueue nd"); } err = clFinish(queue); if (err != CL_SUCCESS) { puts("fini"); } if (ei == 0) { int *ptr = (int*)clEnqueueMapBuffer(queue, out, CL_TRUE, CL_MAP_READ, 0, run_size*sizeof(int), 0, NULL, NULL, NULL); int i; for (i=0; i<run_size; i++) { printf("%d : %x\n", i, ptr[i]); } clEnqueueUnmapMemObject(queue, in, ptr, 0, NULL, NULL); } err = clFinish(queue); clReleaseMemObject(in); clReleaseMemObject(out); clReleaseKernel(ker); clReleaseProgram(cl_prog); } return 0; }
/** * \brief Creates and initializes the working data for the plan * \param plan The Plan struct that holds the plan's data values. * \return Error flag value */ int initOPENCL_MEMPlan(void *plan){ // <- Replace YOUR_NAME with the name of your module. if(!plan){ return make_error(ALLOC, generic_err); // <- This is the error code for one of the malloc fails. } Plan *p; OPENCL_MEM_DATA *d; p = (Plan *)plan; #ifdef HAVE_PAPI int temp_event, i; int PAPI_Events [NUM_PAPI_EVENTS] = PAPI_COUNTERS; char *PAPI_units [NUM_PAPI_EVENTS] = PAPI_UNITS; #endif //HAVE_PAPI if(p){ d = (OPENCL_MEM_DATA *)p->vptr; p->exec_count = 0; // Initialize the plan execution count to zero. perftimer_init(&p->timers, NUM_TIMERS); // Initialize all performance timers to zero. #ifdef HAVE_PAPI /* Initialize plan's PAPI data */ p->PAPI_EventSet = PAPI_NULL; p->PAPI_Num_Events = 0; TEST_PAPI(PAPI_create_eventset(&p->PAPI_EventSet), PAPI_OK, MyRank, 9999, PRINT_SOME); //Add the desired events to the Event Set; ensure the dsired counters // are on the system then add, ignore otherwise for(i = 0; i < TOTAL_PAPI_EVENTS && i < NUM_PAPI_EVENTS; i++){ temp_event = PAPI_Events[i]; if(PAPI_query_event(temp_event) == PAPI_OK){ p->PAPI_Num_Events++; TEST_PAPI(PAPI_add_event(p->PAPI_EventSet, temp_event), PAPI_OK, MyRank, 9999, PRINT_SOME); } } PAPIRes_init(p->PAPI_Results, p->PAPI_Times); PAPI_set_units(p->name, PAPI_units, NUM_PAPI_EVENTS); TEST_PAPI(PAPI_start(p->PAPI_EventSet), PAPI_OK, MyRank, 9999, PRINT_SOME); #endif //HAVE_PAPI } if(d){ cl_int error; pthread_mutex_lock(&opencl_platform_mutex); error = clGetPlatformIDs(0, NULL,&(d->num_platforms)); pthread_mutex_unlock(&opencl_platform_mutex); assert(error == CL_SUCCESS); d->platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * d->num_platforms); pthread_mutex_lock(&opencl_platform_mutex); error = clGetPlatformIDs(d->num_platforms, d->platforms, NULL); pthread_mutex_unlock(&opencl_platform_mutex); assert(error == CL_SUCCESS); error = clGetDeviceIDs(d->platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &(d->num_devices)); assert(error == CL_SUCCESS); d->devices = (cl_device_id *)malloc(sizeof(cl_device_id) * d->num_devices); error = clGetDeviceIDs(d->platforms[0],CL_DEVICE_TYPE_ALL, d->num_devices, d->devices, NULL); assert(error == CL_SUCCESS); d->context = clCreateContext(NULL, 1, &(d->devices[d->device_id]), NULL, NULL, &error); assert(error == CL_SUCCESS); d->opencl_queue = clCreateCommandQueue(d->context, d->devices[d->device_id], 0, &error); assert(error == CL_SUCCESS); error = clGetDeviceInfo(d->devices[d->device_id], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &(d->device_memory), NULL); assert(error == CL_SUCCESS); d->device_memory -= SUB_FACTOR; d->buffer = clCreateBuffer(d->context, CL_MEM_WRITE_ONLY, d->device_memory, NULL, &error); assert(error == CL_SUCCESS); size_t page_size = sysconf(_SC_PAGESIZE); error = posix_memalign((void **)&(d->return_buffer), page_size, d->device_memory); assert(error == 0); d->program = clCreateProgramWithSource(d->context, 1, (const char **)&opencl_program,NULL,&error); assert(error == CL_SUCCESS); error = clBuildProgram(d->program,1,&(d->devices[d->device_id]),NULL,NULL,NULL); assert(error == CL_SUCCESS); d->kernel = clCreateKernel(d->program, "write_pattern", &error); assert(error == CL_SUCCESS); } return ERR_CLEAN; // <- This indicates a clean run with no errors. Does not need to be changed. } /* initOPENCL_MEMPlan */
int main(int argc, char** argv) { printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE); cl_int error; cl_uint num_platforms; // Get the number of platforms error = clGetPlatformIDs(0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of platforms cl_platform_id* platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs(num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Print the chosen platform (if there are multiple platforms, choose the first one) cl_platform_id platform = platforms[0]; char pbuf[100]; error = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); // Create a GPU context cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get and print the chosen device (if there are multiple devices, choose the first one) size_t devices_size; error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo(context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); device = devices[0]; error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); // Create a command queue command_queue = clCreateCommandQueue(context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); int size; int grid_rows,grid_cols = 0; float *FilesavingTemp,*FilesavingPower; //,*MatrixOut; char *tfile, *pfile, *ofile; int total_iterations = 60; int pyramid_height = 1; // number of iterations if (argc < 7) usage(argc, argv); if((grid_rows = atoi(argv[1]))<=0|| (grid_cols = atoi(argv[1]))<=0|| (pyramid_height = atoi(argv[2]))<=0|| (total_iterations = atoi(argv[3]))<=0) usage(argc, argv); tfile=argv[4]; pfile=argv[5]; ofile=argv[6]; size=grid_rows*grid_cols; // --------------- pyramid parameters --------------- int borderCols = (pyramid_height)*EXPAND_RATE/2; int borderRows = (pyramid_height)*EXPAND_RATE/2; int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1); int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1); FilesavingTemp = (float *) malloc(size*sizeof(float)); FilesavingPower = (float *) malloc(size*sizeof(float)); // MatrixOut = (float *) calloc (size, sizeof(float)); if( !FilesavingPower || !FilesavingTemp) // || !MatrixOut) fatal("unable to allocate memory"); // Read input data from disk readinput(FilesavingTemp, grid_rows, grid_cols, tfile); readinput(FilesavingPower, grid_rows, grid_cols, pfile); // Load kernel source from file const char *source = load_kernel_source("hotspot_kernel.cl"); size_t sourceSize = strlen(source); // Compile the kernel cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); char clOptions[110]; // sprintf(clOptions,"-I../../src"); sprintf(clOptions," "); #ifdef BLOCK_SIZE sprintf(clOptions + strlen(clOptions), " -DBLOCK_SIZE=%d", BLOCK_SIZE); #endif // Create an executable from the kernel error = clBuildProgram(program, 1, &device, clOptions, NULL, NULL); // Show compiler warnings/errors static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); kernel = clCreateKernel(program, "hotspot", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long start_time = get_time(); // Create two temperature matrices and copy the temperature input data cl_mem MatrixTemp[2]; // Create input memory buffers on device MatrixTemp[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingTemp, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Lingjie Zhang modifited at Nov 1, 2015 //MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float) * size, NULL, &error); MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE , sizeof(float) * size, NULL, &error); // end Lingjie Zhang modification if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Copy the power input data cl_mem MatrixPower = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingPower, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Perform the computation int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows, total_iterations, pyramid_height, blockCols, blockRows, borderCols, borderRows, FilesavingTemp, FilesavingPower); // Copy final temperature data back cl_float *MatrixOut = (cl_float *) clEnqueueMapBuffer(command_queue, MatrixTemp[ret], CL_TRUE, CL_MAP_READ, 0, sizeof(float) * size, 0, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long end_time = get_time(); printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000)); // Write final output to output file writeoutput(MatrixOut, grid_rows, grid_cols, ofile); error = clEnqueueUnmapMemObject(command_queue, MatrixTemp[ret], (void *) MatrixOut, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); clReleaseMemObject(MatrixTemp[0]); clReleaseMemObject(MatrixTemp[1]); clReleaseMemObject(MatrixPower); clReleaseContext(context); return 0; }
int main(int argc, char** argv) { // beginning of the verbose OpenCL allocation cl_platform_id platform_id = NULL; cl_uint ret_num_platforms = 0; cl_uint ret_num_devices = 0; cl_int ret = 0; // the output from opencl kernel float *c_inputs = malloc(ARRAY_SIZE*sizeof(float)); float *c_outputs = malloc(ARRAY_SIZE*sizeof(float)); cl_float *cl_inputs = malloc(ARRAY_SIZE*sizeof(cl_float)); cl_float *cl_outputs = malloc(ARRAY_SIZE*sizeof(cl_float)); // get random numbers via Rmath set_seed(atoi(argv[1]), 197414); float tmp_in = 0.0; #pragma omp parallel for for (long i = 0; i < ARRAY_SIZE; i++) { tmp_in = rnorm(0, 1); c_inputs[i] = tmp_in; cl_inputs[i] = (cl_float) tmp_in; } // measure time elapse clock_t start = clock(); #pragma omp parallel for for (long i = 0; i < ARRAY_SIZE; i++) { c_outputs[i] = expf(c_inputs[i]); } printf("CPU time for %d exp operation: %d\n", ARRAY_SIZE, (int) (clock() - start)); // read kernel source FILE *fp; char filename[] = "./hello_log.cl"; char *source_str; size_t source_size; fp = fopen(filename, "r"); source_str = (char*) malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); // get platform and device info ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); cl_device_id device_ids[2]; ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 2, device_ids, &ret_num_devices); printf("Number of devices: %5d\n", ret_num_devices); // print device name char bdname[100]; clGetDeviceInfo(device_ids[1], CL_DEVICE_NAME, 100, bdname, NULL); printf("Used device: %s\n", bdname); // use second GPU cl_device_id device_id = device_ids[1]; // create opencl context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // create command queue cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_id, 0, &ret); // create memory buffer for input cl_mem memobj_in = clCreateBuffer(context, CL_MEM_READ_WRITE, ARRAY_SIZE*sizeof(cl_float), NULL, &ret); // create memory buffer for output cl_mem memobj_out = clCreateBuffer(context, CL_MEM_READ_WRITE, ARRAY_SIZE*sizeof(cl_float), NULL, &ret); // create kernel program cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); // build program ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); printf("build program successfully\n"); // create opencl kernel cl_kernel kernel = clCreateKernel(program, "hello_exp", &ret); // set opencl parameters for inputs ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj_in); // set opencl parameters for inputs ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj_out); // execute opencl kernel size_t global_item_size = ARRAY_SIZE/32; size_t local_item_size = 32; // measure time start = clock(); ret = clEnqueueWriteBuffer(command_queue, memobj_in, CL_TRUE, 0, ARRAY_SIZE*sizeof(cl_float), cl_inputs, 0, NULL, NULL); // run it ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); // copy results from the memory buffer ret = clEnqueueReadBuffer(command_queue, memobj_out, CL_TRUE, 0, ARRAY_SIZE*sizeof(cl_float), cl_outputs, 0, NULL, NULL); printf("GPU time (with PCI-E overhead): %d\n", (int) (clock() - start)); printf("inputs: %3.7f %3.7f\n", c_inputs[150000], cl_inputs[150000]); printf("outputs: %3.7f %3.7f\n", c_outputs[150000], (float) cl_outputs[150000]); // finalization ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(memobj_in); ret = clReleaseMemObject(memobj_out); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(source_str); return 0; }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'tgamma_float4.cl' */ source_code = read_buffer("tgamma_float4.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "tgamma_float4", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_float4 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_float4)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_float4){{2.0, 2.0, 2.0, 2.0}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float4), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float4), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_float4 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_float4)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_float4)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_float4), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_float4), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_float4)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
long OpenCLDevice::getMaxWorkGroupSize() { cl_ulong value; check_error(clGetDeviceInfo(my_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, (sizeof(cl_ulong)), &value, NULL)); return value; }
int main(int argc, char **argv) { cl_int ret; /* * Command line */ char *source_path; if (argc != 2) { printf("syntax: %s <kernel-source>\n", argv[0]); exit(1); } source_path = argv[1]; /* * Platform */ /* Get platform */ cl_platform_id platform; cl_uint num_platforms; ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: second call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n", platform_name); printf("\n"); /* * Device */ /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* * Context */ /* Create context */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } /* * Command Queue */ /* Create command queue */ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("\n"); /* * Program */ /* Program source */ const char *source; size_t source_length; /* Read binary */ source = read_buffer(source_path, &source_length); if (!source) { printf("error: %s: cannot open kernel source\n", source_path); exit(1); } /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, &source, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* * Kernel */ /* Create kernel */ cl_kernel kernel; kernel = clCreateKernel(program, "vector_add", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } printf("\n"); /* * Buffers */ /* Create and allocate host buffers */ size_t num_elem = 10; cl_int *src1_host_buffer; cl_int *src2_host_buffer; cl_int *dst_host_buffer; src1_host_buffer = malloc(num_elem * sizeof(cl_int)); src2_host_buffer = malloc(num_elem * sizeof(cl_int)); dst_host_buffer = malloc(num_elem * sizeof(cl_int)); /* Initialize host source buffer */ int i; for (i = 0; i < num_elem; i++) { src1_host_buffer[i] = i; src2_host_buffer[i] = 100; } /* Create device source buffers */ cl_mem src1_device_buffer; cl_mem src2_device_buffer; src1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL); src2_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL); if (!src1_device_buffer || !src2_device_buffer) { printf("error: could not create destination buffer\n"); exit(1); } /* Create device destination buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem * sizeof(cl_int), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create destination buffer\n"); exit(1); } /* Copy buffer */ ret = clEnqueueWriteBuffer(command_queue, src1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int), src1_host_buffer, 0, NULL, NULL); ret |= clEnqueueWriteBuffer(command_queue, src2_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int), src2_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* * Kernel arguments */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src1_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src2_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* * Launch Kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; /* Launch the kernel */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* * Result */ /* Receive buffer */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Print result */ for (i = 0; i < num_elem; i++) printf("dst_host_buffer[%d] = %d\n", i, dst_host_buffer[i]); printf("\n"); return 0; }
long OpenCLDevice::getMaxMemAllocSize() { cl_ulong value; check_error(clGetDeviceInfo(my_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, (sizeof(cl_ulong)), &value, NULL)); return value; }
/* Find a GPU or CPU associated with the first available platform */ void CLHash_Utilities_CreateContext_p(cl_context *context, cl_command_queue *command_queue, const char *file , int line) { uint num_platforms; cl_platform_id *platforms; cl_device_id device; int err; /* Get all available platforms */ err = clGetPlatformIDs(0, NULL, &num_platforms); CLHash_Utilities_HandleError(err, "CLHash_Utilities_CreateContext", "clGetPlatformIDs"); platforms = (cl_platform_id *)malloc(num_platforms*sizeof(cl_platform_id)); /* Identify a platform */ err = clGetPlatformIDs(num_platforms, platforms, NULL); CLHash_Utilities_HandleError(err, "CLHash_Utilities_CreateContext", "clGetPlatformIDs"); if (DEBUG == 1) { char info[1024]; for (int iplatform=0; iplatform<num_platforms; iplatform++){ printf(" Platform %d:\n",iplatform+1); //clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_PROFILE, 1024L,info,0); //printf(" CL_PLATFORM_PROFILE : %s\n",info); clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_VERSION, 1024L,info,0); printf(" CL_PLATFORM_VERSION : %s\n",info); clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_NAME, 1024L,info,0); printf(" CL_PLATFORM_NAME : %s\n",info); clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_VENDOR, 1024L,info,0); printf(" CL_PLATFORM_VENDOR : %s\n",info); //clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_EXTENSIONS,1024L,info,0); // printf(" CL_PLATFORM_EXTENSIONS : %s\n",info); } } /* Access a device */ for (int iplatform=0; iplatform<num_platforms; iplatform++){ err = clGetDeviceIDs(platforms[iplatform], CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(err != CL_DEVICE_NOT_FOUND){ break; } } if(err == CL_DEVICE_NOT_FOUND){ for (int iplatform=0; iplatform<num_platforms; iplatform++){ err = clGetDeviceIDs(platforms[iplatform], CL_DEVICE_TYPE_ACCELERATOR, 1, &device, NULL); if(err != CL_DEVICE_NOT_FOUND){ break; } } if(err == CL_DEVICE_NOT_FOUND){ for (int iplatform=0; iplatform<num_platforms; iplatform++){ err = clGetDeviceIDs(platforms[iplatform], CL_DEVICE_TYPE_CPU, 1, &device, NULL); if(err != CL_DEVICE_NOT_FOUND){ break; } } } } CLHash_Utilities_HandleError(err, "CLHash_Utilities_CreateContext", "clGetDeviceIDs"); if (DEBUG == 1) { char info[1024]; printf("\n\n"); printf(" Device:\n"); clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(info), info, NULL); printf(" CL_DEVICE_NAME : %s\n",info); clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(info), info, NULL); printf(" CL_DEVICE_VENDOR : %s\n",info); } *context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err != CL_SUCCESS) CLHash_Utilities_PrintError_p(err, "CLHash_Utilities_CreateContext", "clCreateContext", file, line); *command_queue = clCreateCommandQueue(*context, device, CL_QUEUE_PROFILING_ENABLE, &err); if(err != CL_SUCCESS) CLHash_Utilities_PrintError_p(err, "CLHash_Utilities_CreateContext", "clCreateCommandQueue", file, line); free(platforms); }
int main () { int err, i; cl_platform_id platform; cl_device_id device; cl_context context; cl_context_properties context_props[3]; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem buffer; size_t len; const char *program_source = NULL; char *device_extensions = NULL; char kernel_build_opts[256]; size_t size = sizeof (cl_int) * SIZE; const size_t global_work_size[] = {SIZE, 0, 0}; /* size of each dimension */ cl_int *data; /* In order to see which devices the OpenCL implementation on your platform provides you may issue a call to the print_clinfo () fuction. */ /* Initialize the data the OpenCl program operates on. */ data = (cl_int*) calloc (1, size); if (data == NULL) { fprintf (stderr, "calloc failed\n"); exit (EXIT_FAILURE); } /* Pick the first platform. */ CHK (clGetPlatformIDs (1, &platform, NULL)); /* Get the default device and create context. */ CHK (clGetDeviceIDs (platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL)); context_props[0] = CL_CONTEXT_PLATFORM; context_props[1] = (cl_context_properties) platform; context_props[2] = 0; context = clCreateContext (context_props, 1, &device, NULL, NULL, &err); CHK_ERR ("clCreateContext", err); queue = clCreateCommandQueue (context, device, 0, &err); CHK_ERR ("clCreateCommandQueue", err); /* Query OpenCL extensions of that device. */ CHK (clGetDeviceInfo (device, CL_DEVICE_EXTENSIONS, 0, NULL, &len)); device_extensions = (char *) malloc (len); CHK (clGetDeviceInfo (device, CL_DEVICE_EXTENSIONS, len, device_extensions, NULL)); strcpy (kernel_build_opts, "-Werror -cl-opt-disable"); if (strstr (device_extensions, "cl_khr_fp64") != NULL) strcpy (kernel_build_opts + strlen (kernel_build_opts), " -D HAVE_cl_khr_fp64"); if (strstr (device_extensions, "cl_khr_fp16") != NULL) strcpy (kernel_build_opts + strlen (kernel_build_opts), " -D HAVE_cl_khr_fp16"); /* Read the OpenCL kernel source into the main memory. */ program_source = read_file (STRINGIFY (CL_SOURCE), &len); if (program_source == NULL) { fprintf (stderr, "file does not exist: %s\n", STRINGIFY (CL_SOURCE)); exit (EXIT_FAILURE); } /* Build the OpenCL kernel. */ program = clCreateProgramWithSource (context, 1, &program_source, &len, &err); free ((void*) program_source); CHK_ERR ("clCreateProgramWithSource", err); err = clBuildProgram (program, 0, NULL, kernel_build_opts, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char *clbuild_log = NULL; CHK (clGetProgramBuildInfo (program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)); clbuild_log = malloc (len); if (clbuild_log) { CHK (clGetProgramBuildInfo (program, device, CL_PROGRAM_BUILD_LOG, len, clbuild_log, NULL)); fprintf (stderr, "clBuildProgram failed with:\n%s\n", clbuild_log); free (clbuild_log); } exit (EXIT_FAILURE); } /* In some cases it might be handy to save the OpenCL program binaries to do further analysis on them. In order to do so you may call the following function: save_program_binaries (program);. */ kernel = clCreateKernel (program, "testkernel", &err); CHK_ERR ("clCreateKernel", err); /* Setup the input data for the kernel. */ buffer = clCreateBuffer (context, CL_MEM_USE_HOST_PTR, size, data, &err); CHK_ERR ("clCreateBuffer", err); /* Execute the kernel (data parallel). */ CHK (clSetKernelArg (kernel, 0, sizeof (buffer), &buffer)); CHK (clEnqueueNDRangeKernel (queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL)); /* Fetch the results (blocking). */ CHK (clEnqueueReadBuffer (queue, buffer, CL_TRUE, 0, size, data, 0, NULL, NULL)); /* Compare the results. */ for (i = 0; i < SIZE; i++) { if (data[i] != 0x1) { fprintf (stderr, "error: data[%d]: %d != 0x1\n", i, data[i]); exit (EXIT_FAILURE); } } /* Cleanup. */ CHK (clReleaseMemObject (buffer)); CHK (clReleaseKernel (kernel)); CHK (clReleaseProgram (program)); CHK (clReleaseCommandQueue (queue)); CHK (clReleaseContext (context)); free (data); return 0; }