void imgdiff(size_t N, size_t width, size_t height, double* diff_matrix, unsigned char* images) { //// we need to fill in //// cl_platform_id *platform; cl_device_type dev_type = CL_DEVICE_TYPE_GPU; cl_device_id *devs; cl_context context; cl_command_queue *cmd_queues; cl_program program; cl_kernel *kernels; cl_uint num_platforms; cl_uint num_devs; cl_mem* m_image1; cl_mem* m_image2; cl_mem* m_result; cl_event* ev_kernels; int err = CL_SUCCESS; int i, j, k; // modify version err = clGetPlatformIDs(0, NULL, &num_platforms); if(err != CL_SUCCESS) { printf("Error: platform error\n"); return 0; } if(num_platforms == 0) { printf("Error: platform no count\n"); return 0; } platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms); err = clGetPlatformIDs(num_platforms, platform, NULL); if(err != CL_SUCCESS) { printf("Error: clGetPlatformIDs error\n"); return 0; } for(i = 0; i<num_platforms; i++) { err = clGetDeviceIDs(platform[i], dev_type, 0, NULL, &num_devs); if(err != CL_SUCCESS) { printf("Error: clGetDevice\n"); return 0; } if(num_devs >= 1) { devs = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devs); clGetDeviceIDs(platform[i], dev_type, num_devs, devs, NULL); break; } } context = clCreateContext(NULL, num_devs, devs, NULL, NULL, &err); if(err != CL_SUCCESS) { printf("Error: clCreateContext error\n"); return 0; } char* source = NULL; size_t src_size = 0; err = ReadSourceFromFile("./imgdiff_cal.cl", &source, &src_size); if (CL_SUCCESS != err) { printf("Error: ReadSourceFromFile returned %s.\n", err); free(source); return 0; } program = clCreateProgramWithSource(context, 1, (const char**)&source, &src_size, &err); if(err != CL_SUCCESS) { printf("Error: clCreateProgram error\n"); return 0; } free(source); printf("Create Program Success\n"); #if DBG // Measure clBuildProgram -@henry added gettimeofday(&start_m, NULL ); #endif err = clBuildProgram(program, num_devs, devs, "", NULL, NULL); #if DBG gettimeofday(&end_m, NULL ); double time = (end_m.tv_usec - start_m.tv_usec)*1e-6 + (end_m.tv_sec - start_m.tv_sec); printf("[Debug] Elapsed Time of clBuildProgram() : %lf s\n",time); #endif if(err != CL_SUCCESS) { printf("Error: clBuildProgram\n"); return 0; } printf("Build Program Success\n"); kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*num_devs); for(i = 0; i<num_devs; i++) { kernels[i] = clCreateKernel(program, "imgdiff_cal", NULL); } printf("Create Kernel Success\n"); cmd_queues = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_devs); for(i=0; i<num_devs; i++) { cmd_queues[i] = clCreateCommandQueue(context, devs[i], 0, &err); if(err != CL_SUCCESS) { printf("Error: clCreateCommandQueue error\n"); return 0; } } printf("Create commandQueue Success\n"); int LOCAL_WIDTH = 16; int LOCAL_HEIGHT = 16; int WORK_WIDTH = ceil((double)width / LOCAL_WIDTH)*LOCAL_WIDTH; int WORK_HEIGHT = ceil((double)height/LOCAL_HEIGHT) * LOCAL_HEIGHT; int WORK_AMOUNT = width * height; int WORK_GROUP_COUNT = ceil(((double)WORK_WIDTH * WORK_HEIGHT) / (LOCAL_WIDTH * LOCAL_HEIGHT)); int WORK_GROUP_WIDTH = width; int WORK_GROUP_HEIGHT = height; int SAMPLE_COUNT = 16; int WORK_COUNT[num_devs]; double tmp_result_data[WORK_GROUP_COUNT*SAMPLE_COUNT]; printf("WORK_WIDTH %d\tWORK_HEIGHT %d\t WORK_AMOUNT %d\t WORK_GROUP_COUNT %d\n", WORK_WIDTH, WORK_HEIGHT, WORK_AMOUNT, WORK_GROUP_COUNT); m_image1 = (cl_mem*)malloc(sizeof(cl_mem)* num_devs); m_image2 = (cl_mem*)malloc(sizeof(cl_mem)* num_devs); m_result = (cl_mem*)malloc(sizeof(cl_mem)* num_devs); for(i=0; i<num_devs; i++) { m_image1[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * WORK_AMOUNT * 3, NULL, NULL); m_image2[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * WORK_AMOUNT*SAMPLE_COUNT * 3, NULL, NULL); m_result[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double) * WORK_GROUP_COUNT * SAMPLE_COUNT, NULL, NULL); clSetKernelArg(kernels[i], 0, sizeof(cl_mem), (void*)&m_image1[i]); clSetKernelArg(kernels[i], 1, sizeof(cl_mem), (void*)&m_image2[i]); clSetKernelArg(kernels[i], 2, sizeof(cl_mem), (void*)&m_result[i]); clSetKernelArg(kernels[i], 3, sizeof(cl_int), &WORK_GROUP_WIDTH); clSetKernelArg(kernels[i], 4, sizeof(cl_int), &WORK_GROUP_HEIGHT); } ev_kernels = (cl_event*)malloc(sizeof(cl_event)*num_devs); int row, col; row = 0; col = 1; for(row = 0; row < N; row++) { if( (N-row-1) < (SAMPLE_COUNT*4) && SAMPLE_COUNT > 1) SAMPLE_COUNT = SAMPLE_COUNT / 2; int remain_count = N - (row + 1); for(i=0; i<num_devs; i++) { clEnqueueWriteBuffer(cmd_queues[i], m_image1[i], CL_FALSE, 0, sizeof(unsigned char) * WORK_AMOUNT * 3, (void*)(images + (row * width*height)*3), 0, NULL, NULL); } diff_matrix[row*N + row] = 0; col = row + 1; while( col< N) { size_t lws[2] = { LOCAL_WIDTH, LOCAL_HEIGHT }; size_t gws[2] = { WORK_WIDTH, WORK_HEIGHT}; for(i=0; i<num_devs; i++) { if((remain_count - SAMPLE_COUNT) < 0) { WORK_COUNT[i] = remain_count; remain_count = 0; } else { WORK_COUNT[i] = SAMPLE_COUNT; remain_count = remain_count - SAMPLE_COUNT; } if(WORK_COUNT[i] != 0) { clSetKernelArg(kernels[i], 5, sizeof(cl_int), &WORK_COUNT[i]); int offset = 0; for(j=0; j<i; j++) offset += WORK_COUNT[j]; err = clEnqueueWriteBuffer(cmd_queues[i], m_image2[i], CL_FALSE, 0, sizeof(unsigned char)*WORK_AMOUNT*WORK_COUNT[i]*3, (void*)(images +((col * width*height) + (WORK_AMOUNT * offset))*3), 0, NULL, NULL); } } for( i=0; i < num_devs; i++ ) { if(WORK_COUNT[i] != 0) { err = clEnqueueNDRangeKernel(cmd_queues[i], kernels[i], 2, NULL, gws, lws, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Error: clEnqueueNDRangeKernel %d error\n", i); printf("%s\n", TranslateOpenCLError(err)); return 0; } } } double tmp_sum = 0; i = 0; for( i = num_devs -1; i >= 0; i-- ) { if(WORK_COUNT[i] != 0) { err = clEnqueueReadBuffer( cmd_queues[i], m_result[i], CL_TRUE, 0, sizeof(double) * WORK_GROUP_COUNT * WORK_COUNT[i], tmp_result_data, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Error: clEnqueueReadBuffer%d error\n", i); return 0; } //printf("receive......"); for(j = 0; j<WORK_COUNT[i]; j++) { tmp_sum = 0; for(k = 0; k<WORK_GROUP_COUNT; k++) { tmp_sum += tmp_result_data[k + j*WORK_GROUP_COUNT]; //printf("%lf\t", tmp_result_data[k+j*WORK_GROUP_COUNT]); } //printf("%lf %lf\n", tmp_sum, tmp_result_data[j*WORK_GROUP_COUNT]); int offset = 0; for(k=0; k<i; k++) offset += WORK_COUNT[k]; diff_matrix[row*N+col+j+offset] = diff_matrix[(col+j+offset)*N+row] = tmp_sum; } } } for( i = 0; i < num_devs; i++ ) { col += WORK_COUNT[i]; } } } }
// SETUP int CLContext::setupCL() { cl_int status = CL_SUCCESS; cl_device_type dType; int gpu = 1; if(gpu == 0) dType = CL_DEVICE_TYPE_CPU; else //deviceType = "gpu" dType = CL_DEVICE_TYPE_GPU; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. <----- LOL check out the amd propaganda */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) return CL_FAILURE; if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) return CL_FAILURE; for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(!checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) return CL_FAILURE; platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) break; } delete[] platforms; } /* * If we could find our platform, use it. Otherwise pass a NULL and get whatever the * implementation thinks we should be using. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; /* Use NULL for backward compatibility */ cl_context_properties* cprops = (NULL == platform) ? NULL : cps; context = clCreateContextFromType( cprops, dType, NULL, NULL, &status); if(!checkVal( status, CL_SUCCESS, "clCreateContextFromType failed.")) return CL_FAILURE; size_t deviceListSize; /* First, get the size of device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(!checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return CL_FAILURE; /* Now allocate memory for device list based on the size we got earlier */ devices = (cl_device_id*)malloc(deviceListSize); if(devices==NULL) { cout << "Failed to allocate memory (devices)." << endl; return CL_FAILURE; } /* Now, get the device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(!checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return CL_FAILURE; /* Create command queue */ commandQueue = clCreateCommandQueue( context, devices[0], 0, &status); if(!checkVal( status, CL_SUCCESS, "clCreateCommandQueue failed.")) return CL_FAILURE; /* Get Device specific Information */ status = clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void*)&maxWorkGroupSize, NULL); if(!checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed.")) return CL_FAILURE; status = clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void*)&maxDimensions, NULL); if(!checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed.")) return CL_FAILURE; maxWorkItemSizes = (size_t *)malloc(maxDimensions * sizeof(unsigned int)); status = clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * maxDimensions, (void*)maxWorkItemSizes, NULL); if(!checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed.")) return CL_FAILURE; status = clGetDeviceInfo( devices[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), (void *)&totalLocalMemory, NULL); if(!checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_LOCAL_MEM_SIZE failed.")) return CL_FAILURE; /* * Create and initialize memory objects */ /* create a CL program using the kernel source */ string content; fileH.open( "critterding.cl", content ); const char * source = content.c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource( context, 1, &source, sourceSize, &status); if(!checkVal( status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return CL_FAILURE; /* create a cl program executable for all the devices specified */ status = clBuildProgram( program, 1, &devices[0], NULL, NULL, NULL); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo (program, devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) return CL_FAILURE; buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { cout << "Failed to allocate host memory. (buildLog)" << endl; return CL_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo (program, devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return CL_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!checkVal( status, CL_SUCCESS, "clBuildProgram failed.")) return CL_FAILURE; } return CL_SUCCESS; }
void buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_left(int xdim0, int ydim0, int xdim1, int ydim1) { //int ocl_fma = OCL_FMA; if(!isbuilt_update_halo_kernel2_yvel_plus_2_left) { buildOpenCLKernels(); //clSafeCall( clUnloadCompiler() ); cl_int ret; char* source_filename[1] = {"./OpenCL/update_halo_kernel2_yvel_plus_2_left.cl"}; // Load the kernel source code into the array source_str FILE *fid; char *source_str[1]; size_t source_size[1]; for(int i=0; i<1; i++) { fid = fopen(source_filename[i], "r"); if (!fid) { fprintf(stderr, "Can't open the kernel source file!\n"); exit(1); } source_str[i] = (char*)malloc(4*0x1000000); source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); if(source_size[i] != 4*0x1000000) { if (ferror(fid)) { printf ("Error while reading kernel source file %s\n", source_filename[i]); exit(-1); } if (feof(fid)) printf ("Kernel source file %s succesfuly read.\n", source_filename[i]); //printf("%s\n",source_str[i]); } fclose(fid); } printf("Compiling update_halo_kernel2_yvel_plus_2_left %d source -- start \n",OCL_FMA); // Create a program from the source OPS_opencl_core.program = clCreateProgramWithSource(OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); clSafeCall( ret ); // Build the program char buildOpts[255*3]; char* pPath = NULL; pPath = getenv ("OPS_INSTALL_PATH"); if (pPath!=NULL) if(OCL_FMA) sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d -Dxdim0_update_halo_kernel2_yvel_plus_2_left=%d -Dydim0_update_halo_kernel2_yvel_plus_2_left=%d -Dxdim1_update_halo_kernel2_yvel_plus_2_left=%d -Dydim1_update_halo_kernel2_yvel_plus_2_left=%d ", pPath, 32,xdim0,ydim0,xdim1,ydim1); else sprintf(buildOpts,"-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d -Dxdim0_update_halo_kernel2_yvel_plus_2_left=%d -Dydim0_update_halo_kernel2_yvel_plus_2_left=%d -Dxdim1_update_halo_kernel2_yvel_plus_2_left=%d -Dydim1_update_halo_kernel2_yvel_plus_2_left=%d ", pPath, 32,xdim0,ydim0,xdim1,ydim1); else { sprintf("Incorrect OPS_INSTALL_PATH %s\n",pPath); exit(EXIT_FAILURE); } ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id, buildOpts, NULL, NULL); if(ret != CL_SUCCESS) { char* build_log; size_t log_size; clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); build_log = (char*) malloc(log_size+1); clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); build_log[log_size] = '\0'; fprintf(stderr, "=============== OpenCL Program Build Info ================\n\n%s", build_log); fprintf(stderr, "\n========================================================= \n"); free(build_log); exit(EXIT_FAILURE); } printf("compiling update_halo_kernel2_yvel_plus_2_left -- done\n"); // Create the OpenCL kernel OPS_opencl_core.kernel[70] = clCreateKernel(OPS_opencl_core.program, "ops_update_halo_kernel2_yvel_plus_2_left", &ret); clSafeCall( ret ); isbuilt_update_halo_kernel2_yvel_plus_2_left = true; } }
int MemoryOptimizations::genBinaryImage() { cl_int status = CL_SUCCESS; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } char platformName[100]; for (unsigned i = 0; i < numPlatforms; ++i) { status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(platformName), platformName, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) { return SDK_FAILURE; } platform = platforms[i]; if (!strcmp(platformName, "Advanced Micro Devices, Inc.")) { break; } } std::cout << "Platform found : " << platformName << "\n"; delete[] platforms; } if(NULL == platform) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[5] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, CL_CONTEXT_OFFLINE_DEVICES_AMD, (cl_context_properties)1, 0 }; context = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateContextFromType failed.")) { return SDK_FAILURE; } /* create a CL program using the kernel source */ streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); kernelPath.append("MemoryOptimizations_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = {strlen(source)}; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) { return SDK_FAILURE; } /* create a cl program executable for all the devices specified */ status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); size_t numDevices; status = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(numDevices), &numDevices, NULL ); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetProgramInfo(CL_PROGRAM_NUM_DEVICES) failed.")) { return SDK_FAILURE; } std::cout << "Number of devices found : " << numDevices << "\n\n"; devices = (cl_device_id *)malloc( sizeof(cl_device_id) * numDevices ); if(devices == NULL) { sampleCommon->error("Failed to allocate host memory.(devices)"); return SDK_FAILURE; } /* grab the handles to all of the devices in the program. */ status = clGetProgramInfo(program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) * numDevices, devices, NULL ); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetProgramInfo(CL_PROGRAM_DEVICES) failed.")) { return SDK_FAILURE; } /* figure out the sizes of each of the binaries. */ size_t *binarySizes = (size_t*)malloc( sizeof(size_t) * numDevices ); if(devices == NULL) { sampleCommon->error("Failed to allocate host memory.(binarySizes)"); return SDK_FAILURE; } status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * numDevices, binarySizes, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetProgramInfo(CL_PROGRAM_BINARY_SIZES) failed.")) { return SDK_FAILURE; } size_t i = 0; /* copy over all of the generated binaries. */ char **binaries = (char **)malloc( sizeof(char *) * numDevices ); if(binaries == NULL) { sampleCommon->error("Failed to allocate host memory.(binaries)"); return SDK_FAILURE; } for(i = 0; i < numDevices; i++) { if(binarySizes[i] != 0) { binaries[i] = (char *)malloc( sizeof(char) * binarySizes[i]); if(binaries[i] == NULL) { sampleCommon->error("Failed to allocate host memory.(binaries[i])"); return SDK_FAILURE; } } else { binaries[i] = NULL; } } status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char *) * numDevices, binaries, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetProgramInfo(CL_PROGRAM_BINARIES) failed.")) { return SDK_FAILURE; } /* dump out each binary into its own separate file. */ for(i = 0; i < numDevices; i++) { char fileName[100]; sprintf(fileName, "%s.%d", dumpBinary.c_str(), (int)i); if(binarySizes[i] != 0) { char deviceName[1024]; status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo(CL_DEVICE_NAME) failed.")) { return SDK_FAILURE; } printf( "%s binary kernel: %s\n", deviceName, fileName); streamsdk::SDKFile BinaryFile; if(!BinaryFile.writeBinaryToFile(fileName, binaries[i], binarySizes[i])) { std::cout << "Failed to load kernel file : " << fileName << std::endl; return SDK_FAILURE; } } else { printf("Skipping %s since there is no binary data to write!\n", fileName); } } // Release all resouces and memory for(i = 0; i < numDevices; i++) { if(binaries[i] != NULL) { free(binaries[i]); binaries[i] = NULL; } } if(binaries != NULL) { free(binaries); binaries = NULL; } if(binarySizes != NULL) { free(binarySizes); binarySizes = NULL; } if(devices != NULL) { free(devices); devices = NULL; } status = clReleaseProgram(program); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) { return SDK_FAILURE; } status = clReleaseContext(context); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clReleaseContext failed.")) { return SDK_FAILURE; } return SDK_SUCCESS; }
cl_kernel load_kernel (char * name, char * kernel_name, cl_device_id dev_id, cl_context context, int * error) { cl_kernel kernel; FILE * fp = fopen(name, "r"); if(fp==NULL) { *error = 1; return NULL; } char * source; size_t size; fseek(fp, 0, SEEK_END); size = ftell(fp); fseek(fp, 0, SEEK_SET); source = malloc((size+1)*sizeof(char)); size = fread(source, 1, size, fp); fclose(fp); source[size] = '\0'; cl_int err; cl_program program = clCreateProgramWithSource (context, 1, (const char **) &source, &size, &err); if(err!=CL_SUCCESS) { *error = 1; return NULL; } clBuildProgram(program, 0, NULL, NULL, NULL, NULL); cl_build_status status; clGetProgramBuildInfo(program, dev_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL); if(status!=CL_BUILD_SUCCESS) { *error = 1; return NULL; } kernel = clCreateKernel(program, kernel_name , &err); if(err!=CL_SUCCESS) { *error = 1; return NULL; } *error = 0; free ( source ); clReleaseProgram(program); return kernel; }
static int build_program_from_file(const char *filename, const char *options, cl_context context, cl_device_id device, cl_program *program_out, cl_int *err) { cl_int _err; FILE *file; char *program_source = NULL; size_t program_source_size; cl_program program = NULL; char *build_log = NULL; assert(filename != NULL); assert(program_out != NULL); if (!err) err = &_err; file = fopen(filename, "r"); if (!file) { ERROR("Couldn't open file \"%s\"", filename); goto error; } if (fseek(file, 0L, SEEK_END)) { ERROR("Cannot determine file size of \"%s\"", filename); goto error; } program_source_size = ftell(file); if (fseek(file, 0L, SEEK_SET)) { ERROR("Cannot determine file size of \"%s\"", filename); goto error; } program_source = malloc(sizeof(*program_source) * (program_source_size + 1)); CHECK_ALLOCATION(program_source); if (fread(program_source, 1, program_source_size, file) != program_source_size) { ERROR("Failed to read file \"%s\"", filename); goto error; } program_source[program_source_size] = '\0'; fclose(file); program = clCreateProgramWithSource(context, 1, (const char **)&program_source, NULL, err); CHECK_CL_ERROR(*err); *err = clBuildProgram(program, 0, NULL, options, NULL, NULL); if (*err == CL_BUILD_PROGRAM_FAILURE) { size_t build_log_size; *err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size); CHECK_CL_ERROR(*err); build_log = malloc(sizeof(*build_log) * build_log_size); CHECK_ALLOCATION(build_log); *err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL); CHECK_CL_ERROR(*err); if (options) ERROR("Failed to build program in file \"%s\" with options \"%s\"", filename, options); else ERROR("Failed to build program in file \"%s\"", filename); debug_printf("================================== BUILD LOG ===================================\n" "%s", NULL, 0, LOGGING_MSG_ERROR, build_log); goto error; } CHECK_CL_ERROR(*err); *program_out = program; return 0; error: free(build_log); if (program) clReleaseProgram(program); *program_out = NULL; free(program_source); return -1; }
int main(int argc, char **argv) { int start,end; unsigned long p[64], c[64], k[56]; unsigned long res; build_samples (p, c, k, 0); set_low_keys(k); cl_platform_id cpPlatform; clGetPlatformIDs(1, &cpPlatform, NULL); cl_device_id cdDevice; clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); char cBuffer[1024]; clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL); printf("CL_DEVICE_NAME:\t\t%s\n", cBuffer); clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL); printf("CL_DRIVER_VERSION:\t%s\n\n", cBuffer); cl_uint compute_units; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); printf("CL_DEVICE_MAX_COMPUTE_UNITS:\t%u\n", compute_units); size_t workitem_dims; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims); size_t workitem_size[3]; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]); size_t workgroup_size; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size); cl_uint clock_frequency; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL); printf("CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency); cl_context GPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, NULL); cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice, 0, NULL); cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 64, p, NULL); cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 64, c, NULL); cl_mem GPUVector3 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 56, k, NULL); cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(unsigned long), &res, NULL); size_t szKernelLength; char* cSourceCL = oclLoadProgSource("ocl_deseval.cl", "", &szKernelLength); cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 1, (const char **)&cSourceCL, &szKernelLength, NULL); if (clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL)!=CL_SUCCESS) { char cBuffer[2048]; if(clGetProgramBuildInfo(OpenCLProgram,cdDevice,CL_PROGRAM_BUILD_LOG,sizeof(cBuffer),cBuffer,NULL)==CL_SUCCESS); printf("Build error:\n%s\n",cBuffer); exit(1); } cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "keysearch", NULL); clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector); clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1); clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2); clSetKernelArg(OpenCLVectorAdd, 3, sizeof(cl_mem), (void*)&GPUVector3); size_t WorkSize[1] = {1024}; start=clock(); for (int i=0; i<1024; i++) { //clEnqueueWriteBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, // 56 * sizeof(unsigned long), k, 0, NULL, NULL); clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, NULL); //clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, // sizeof(unsigned long), &res, 0, NULL, NULL); if(res!=0) { printf("Key found\n"); //key_found(res,k); break; } increment_key (k); } end=clock(); clReleaseKernel(OpenCLVectorAdd); clReleaseProgram(OpenCLProgram); clReleaseCommandQueue(cqCommandQueue); clReleaseContext(GPUContext); clReleaseMemObject(GPUVector1); clReleaseMemObject(GPUVector2); clReleaseMemObject(GPUOutputVector); printf ("Searched %i keys in %.3f seconds\n", 1000000, ((double)(end-start))/CLOCKS_PER_SEC); return 0; }
double gpu_cgm_image(uint32_t* aList, uint32_t* bList, int aLength, int bLength, int keyLength, uint32_t** matches, char* clFile, int x, int y) { int gap = 0, myoffset = 0; cl_platform_id *platforms; cl_uint num_platforms = 0; cl_device_id *devices; cl_uint num_devices = 0; cl_context context; cl_command_queue command_queue; cl_image_format imgFormat; cl_mem aImg; cl_mem bImg; cl_mem res_buf; cl_program program; cl_kernel kernel; cl_uint *results; FILE *prgm_fptr; struct stat prgm_sbuf; char *prgm_data; size_t prgm_size; size_t offset; size_t count; const size_t global_work_size[] = { x, y }; const size_t origin[] = { 0, 0, 0 }; const size_t region[] = { aLength, 1, 1 }; cl_int ret; cl_uint i; cl_bool imageSupport; struct timeval t1, t2; double elapsedTime; results = malloc(sizeof(cl_uint) * aLength); imgFormat.image_channel_order = CL_RGBA; imgFormat.image_channel_data_type = CL_UNSIGNED_INT32; /* figure out how many CL platforms are available */ ret = clGetPlatformIDs(0, NULL, &num_platforms); if (CL_SUCCESS != ret) { print_error ("Error getting the number of platform IDs: %d", ret); exit(EXIT_FAILURE); } if (0 == num_platforms) { print_error ("No CL platforms were found."); exit(EXIT_FAILURE); } /* allocate space for each available platform ID */ if (NULL == (platforms = malloc((sizeof *platforms) * num_platforms))) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* get all of the platform IDs */ ret = clGetPlatformIDs(num_platforms, platforms, NULL); if (CL_SUCCESS != ret) { print_error ("Error getting platform IDs: %d", ret); exit(EXIT_FAILURE); } /* find a platform that supports given device type */ // print_error ("Number of platforms found: %d", num_platforms); for (i = 0; i < num_platforms; i++) { ret = clGetDeviceIDs(platforms[i], getDeviceType(), 0, NULL, &num_devices); if (CL_SUCCESS != ret) continue; if (0 < num_devices) break; } /* make sure at least one device was found */ if (num_devices == 0) { print_error ("No CL device found that supports device type: %s.", ((getDeviceType() == CL_DEVICE_TYPE_CPU) ? "CPU" : "GPU")); exit(EXIT_FAILURE); } /* only one device is necessary... */ num_devices = 1; if (NULL == (devices = malloc((sizeof *devices) * num_devices))) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* get one device id */ ret = clGetDeviceIDs(platforms[i], getDeviceType(), num_devices, devices, NULL); if (CL_SUCCESS != ret) { print_error ("Error getting device IDs: %d", ret); exit(EXIT_FAILURE); } ret = clGetDeviceInfo(*devices, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &imageSupport, NULL); if (CL_SUCCESS != ret) { print_error ("Failed to get Device Info: %d", ret); exit(EXIT_FAILURE); } if(imageSupport == CL_FALSE) { print_error ("Failure: Images are not supported!"); exit(EXIT_FAILURE); } /* create a context for the CPU device that was found earlier */ context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &ret); if (NULL == context || CL_SUCCESS != ret) { print_error ("Failed to create context: %d", ret); exit(EXIT_FAILURE); } /* create a command queue for the CPU device */ command_queue = clCreateCommandQueue(context, devices[0], 0, &ret); if (NULL == command_queue || CL_SUCCESS != ret) { print_error ("Failed to create a command queue: %d", ret); exit(EXIT_FAILURE); } /* create buffers on the CL device */ aImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret); if (NULL == aImg || CL_SUCCESS != ret) { print_error ("Failed to create a image: %d", ret); exit(EXIT_FAILURE); } bImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret); if (NULL == bImg || CL_SUCCESS != ret) { print_error ("Failed to create b image: %d", ret); exit(EXIT_FAILURE); } int res_bufSize = aLength; res_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * res_bufSize, NULL, &ret); if (NULL == res_buf || CL_SUCCESS != ret) { print_error ("Failed to create b buffer: %d", ret); exit(EXIT_FAILURE); } /* read the opencl program code into a string */ prgm_fptr = fopen(clFile, "r"); if (NULL == prgm_fptr) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } if (0 != stat(clFile, &prgm_sbuf)) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } prgm_size = prgm_sbuf.st_size; prgm_data = malloc(prgm_size); if (NULL == prgm_data) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* make sure all data is read from the file (just in case fread returns * short) */ offset = 0; while (prgm_size - offset != (count = fread(prgm_data + offset, 1, prgm_size - offset, prgm_fptr))) offset += count; if (0 != fclose(prgm_fptr)) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } /* create a 'program' from the source */ program = clCreateProgramWithSource(context, 1, (const char **) &prgm_data, &prgm_size, &ret); if (NULL == program || CL_SUCCESS != ret) { print_error ("Failed to create program with source: %d", ret); exit(EXIT_FAILURE); } /* compile the program.. (it uses llvm or something) */ ret = clBuildProgram(program, num_devices, devices, NULL, NULL, NULL); if (CL_SUCCESS != ret) { size_t size; char *log = calloc(1, 4000); if (NULL == log) { print_error ("Out of memory"); exit(EXIT_FAILURE); } print_error ("Failed to build program: %d", ret); ret = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 4096, log, &size); if (CL_SUCCESS != ret) { print_error ("Failed to get program build info: %d", ret); exit(EXIT_FAILURE); } fprintf(stderr, "Begin log:\n%s\nEnd log.\n", log); exit(EXIT_FAILURE); } /* pull out a reference to your kernel */ kernel = clCreateKernel(program, "cgm_kernel", &ret); if (NULL == kernel || CL_SUCCESS != ret) { print_error ("Failed to create kernel: %d", ret); exit(EXIT_FAILURE); } gettimeofday(&t1, NULL); /* write data to these buffers */ clEnqueueWriteImage(command_queue, aImg, CL_FALSE, origin, region, 0, 0, (void*) aImg, 0, NULL, NULL); clEnqueueWriteImage(command_queue, bImg, CL_FALSE, origin, region, 0, 0, (void*) bImg, 0, NULL, NULL); /* set your kernel's arguments */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &aImg); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bImg); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 4, sizeof(int), &gap); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 5, sizeof(int), &myoffset); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 6, sizeof(int), &keyLength); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), &res_buf); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } /* make sure buffers have been written before executing */ ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* enque this kernel for execution... */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue kernel: %d", ret); exit(EXIT_FAILURE); } /* wait for the kernel to finish executing */ ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* copy the contents of dev_buf from the CL device to the host (CPU) */ ret = clEnqueueReadBuffer(command_queue, res_buf, true, 0, sizeof(cl_uint) * aLength, results, 0, NULL, NULL); gettimeofday(&t2, NULL); elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0; // sec to ms elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0; // us to ms if (CL_SUCCESS != ret) { print_error ("Failed to copy data from device to host: %d", ret); exit(EXIT_FAILURE); } ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* make sure the content of the buffer are what we expect */ //for (i = 0; i < aLength; i++) // printf("%d\n", results[i]); /* free up resources */ ret = clReleaseKernel(kernel); if (CL_SUCCESS != ret) { print_error ("Failed to release kernel: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseProgram(program); if (CL_SUCCESS != ret) { print_error ("Failed to release program: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(aImg); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(bImg); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(res_buf); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } if (CL_SUCCESS != (ret = clReleaseCommandQueue(command_queue))) { print_error ("Failed to release command queue: %d", ret); exit(EXIT_FAILURE); } if (CL_SUCCESS != (ret = clReleaseContext(context))) { print_error ("Failed to release context: %d", ret); exit(EXIT_FAILURE); } matches = &results; return elapsedTime; }
/// // main() for Convoloution example // int main(int argc, char** argv) { cl_int errNum; cl_uint numPlatforms; cl_uint numDevices; cl_platform_id * platformIDs; cl_device_id * deviceIDs; cl_context context = NULL; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem inputSignalBuffer; cl_mem outputSignalBuffer; cl_mem maskBuffer; // First, select an OpenCL platform to run on. errNum = clGetPlatformIDs(0, NULL, &numPlatforms); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); platformIDs = (cl_platform_id *)alloca( sizeof(cl_platform_id) * numPlatforms); errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); // Iterate through the list of platforms until we find one that supports // a CPU device, otherwise fail with an error. deviceIDs = NULL; cl_uint i; for (i = 0; i < numPlatforms; i++) { errNum = clGetDeviceIDs( platformIDs[i], CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices); if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND) { checkErr(errNum, "clGetDeviceIDs"); } else if (numDevices > 0) { deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices); errNum = clGetDeviceIDs( platformIDs[i], CL_DEVICE_TYPE_CPU, numDevices, &deviceIDs[0], NULL); checkErr(errNum, "clGetDeviceIDs"); break; } } // Check to see if we found at least one CPU device, otherwise return if (deviceIDs == NULL) { std::cout << "No CPU device found" << std::endl; exit(-1); } // Next, create an OpenCL context on the selected platform. cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformIDs[i], 0 }; context = clCreateContext( contextProperties, numDevices, deviceIDs, &contextCallback, NULL, &errNum); checkErr(errNum, "clCreateContext"); std::ifstream srcFile("../convolution/Convolution.cl"); checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading Convolution.cl"); std::string srcProg( std::istreambuf_iterator<char>(srcFile), (std::istreambuf_iterator<char>())); const char * src = srcProg.c_str(); size_t length = srcProg.length(); // Create program from source program = clCreateProgramWithSource( context, 1, &src, &length, &errNum); checkErr(errNum, "clCreateProgramWithSource"); // Build program errNum = clBuildProgram( program, numDevices, deviceIDs, NULL, NULL, NULL); if (errNum != CL_SUCCESS) { // Determine the reason for the error char buildLog[16384]; clGetProgramBuildInfo( program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL); std::cerr << "Error in kernel: " << std::endl; std::cerr << buildLog; checkErr(errNum, "clBuildProgram"); } // Create kernel object kernel = clCreateKernel( program, "convolve", &errNum); checkErr(errNum, "clCreateKernel"); // Now allocate buffers inputSignalBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * inputSignalHeight * inputSignalWidth, static_cast<void *>(inputSignal), &errNum); checkErr(errNum, "clCreateBuffer(inputSignal)"); maskBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * maskHeight * maskWidth, static_cast<void *>(mask), &errNum); checkErr(errNum, "clCreateBuffer(mask)"); outputSignalBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, NULL, &errNum); checkErr(errNum, "clCreateBuffer(outputSignal)"); // Pick the first device and create command queue. queue = clCreateCommandQueue( context, deviceIDs[0], 0, &errNum); checkErr(errNum, "clCreateCommandQueue"); errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer); errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer); errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer); errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth); errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth); checkErr(errNum, "clSetKernelArg"); const size_t globalWorkSize[1] = { outputSignalWidth * outputSignalHeight }; const size_t localWorkSize[1] = { 1 }; // Queue the kernel up for execution across the array errNum = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); checkErr(errNum, "clEnqueueNDRangeKernel"); errNum = clEnqueueReadBuffer( queue, outputSignalBuffer, CL_TRUE, 0, sizeof(cl_uint) * outputSignalHeight * outputSignalHeight, outputSignal, 0, NULL, NULL); checkErr(errNum, "clEnqueueReadBuffer"); // Output the result buffer for (int y = 0; y < outputSignalHeight; y++) { for (int x = 0; x < outputSignalWidth; x++) { std::cout << outputSignal[x][y] << " "; } std::cout << std::endl; } std::cout << std::endl << "Executed program succesfully." << std::endl; return 0; }
int main(int argc, char** argv) { /* OpenCL 1.1 data structures */ cl_platform_id* platforms; cl_program program; cl_device_id device; cl_context context; cl_command_queue queue; cl_uint numOfPlatforms; cl_int error; cl_mem matrixAMemObj; // input matrix A mem buffer cl_mem matrixBMemObj; // input matrix B mem buffer cl_mem matrixCMemObj; // input matrix C mem buffer cl_int* matrixA; // input matrix A cl_int* matrixB; // input matrix B cl_int* matrixC; // input matrix C cl_uint widthA = WIDTH_G; cl_uint heightA = HEIGHT_G; cl_uint widthB = WIDTH_G; cl_uint heightB = HEIGHT_G; { // allocate memory for input and output matrices // based on whatever matrix theory i know. matrixA = (cl_int*)malloc(widthA * heightA * sizeof(cl_int)); matrixB = (cl_int*)malloc(widthB * heightB * sizeof(cl_int)); matrixC = (cl_int*)malloc(widthB * heightA * sizeof(cl_int)); memset(matrixA, 0, widthA * heightA * sizeof(cl_int)); memset(matrixB, 0, widthB * heightB * sizeof(cl_int)); memset(matrixC, 0, widthB * heightA * sizeof(cl_int)); fillRandom(matrixA, widthA, heightA, 643); fillRandom(matrixB, widthB, heightB, 991); } /* Get the number of platforms Remember that for each vendor's SDK installed on the computer, the number of available platform also increased. */ error = clGetPlatformIDs(0, NULL, &numOfPlatforms); if(error != CL_SUCCESS) { perror("Unable to find any OpenCL platforms"); exit(1); } platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms); printf("Number of OpenCL platforms found: %d\n", numOfPlatforms); error = clGetPlatformIDs(numOfPlatforms, platforms, NULL); if(error != CL_SUCCESS) { perror("Unable to find any OpenCL platforms"); exit(1); } // Search for a GPU device through the installed platforms // Build a OpenCL program and do not run it. for(cl_int i = 0; i < numOfPlatforms; i++ ) { // Get the GPU device error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(error != CL_SUCCESS) { perror("Can't locate a OpenCL compliant device i.e. GPU"); exit(1); } /* Create a context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &error); if(error != CL_SUCCESS) { perror("Can't create a valid OpenCL context"); exit(1); } /* Load the two source files into temporary datastores */ const char *file_names[] = {"mmult.cl"}; const int NUMBER_OF_FILES = 1; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create the OpenCL program object */ program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error); if(error != CL_SUCCESS) { perror("Can't create the OpenCL program object"); exit(1); } /* Build OpenCL program object and dump the error message, if any */ char *program_log; const char options[] = ""; size_t log_size; error = clBuildProgram(program, 1, &device, options, NULL, NULL); if(error != CL_SUCCESS) { // If there's an error whilst building the program, dump the log clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("\n=== ERROR ===\n\n%s\n=============\n", program_log); free(program_log); exit(1); } // Queue is created with profiling enabled cl_command_queue_properties props; props |= CL_QUEUE_PROFILING_ENABLE; queue = clCreateCommandQueue(context, device, props, &error); cl_kernel kernel = clCreateKernel(program, "mmmult", &error); matrixAMemObj = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, widthA * heightA * sizeof(cl_int), matrixA, &error); matrixBMemObj = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, widthB * heightB * sizeof(cl_int), matrixB, &error); matrixCMemObj = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, widthB * heightA * sizeof(cl_int), 0, &error); clSetKernelArg(kernel, 0, sizeof(cl_int),(void*)&widthB); clSetKernelArg(kernel, 1, sizeof(cl_int),(void*)&heightA); clSetKernelArg(kernel, 2, sizeof(cl_mem),(void*)&matrixAMemObj); clSetKernelArg(kernel, 3, sizeof(cl_mem),(void*)&matrixBMemObj); clSetKernelArg(kernel, 4, sizeof(cl_mem),(void*)&matrixCMemObj); size_t globalThreads[] = {heightA}; size_t localThreads[] = {256}; cl_event exeEvt; cl_ulong executionStart, executionEnd; error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &exeEvt); clWaitForEvents(1, &exeEvt); if(error != CL_SUCCESS) { printf("Kernel execution failure!\n"); exit(-22); } // let's understand how long it took? clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_START, sizeof(executionStart), &executionStart, NULL); clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_END, sizeof(executionEnd), &executionEnd, NULL); clReleaseEvent(exeEvt); printf("Execution the matrix-matrix multiplication took %lu.%lu s\n", (executionEnd - executionStart)/1000000000, (executionEnd - executionStart)%1000000000); printf("Execution the matrix-matrix multiplication took %lu s\n", (executionEnd - executionStart)); clEnqueueReadBuffer(queue, matrixCMemObj, CL_TRUE, 0, widthB * heightA * sizeof(cl_int), matrixC, 0, NULL, NULL); if (compare(matrixC, matrixA, matrixB, heightA, widthA, widthB)) printf("Passed!\n"); else printf("Failed!\n"); /* Clean up */ for(i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); } clReleaseProgram(program); clReleaseContext(context); clReleaseMemObject(matrixAMemObj); clReleaseMemObject(matrixBMemObj); clReleaseMemObject(matrixCMemObj); } free(matrixA); free(matrixB); free(matrixC); }
static void clrpc_client_test2(void) { int err; int size = 1024; cl_uint nplatforms = 0; cl_platform_id* platforms = 0; cl_uint nplatforms_ret; clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret); printf( "after call one i get nplatforms_ret = %d", nplatforms_ret); if (nplatforms_ret == 0) exit(1); nplatforms = nplatforms_ret; platforms = (cl_platform_id*)calloc(nplatforms,sizeof(cl_platform_id)); clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret); int i; for(i=0;i<nplatforms;i++) { clrpc_dptr* tmp = ((_xobj_t*)platforms[i])->obj; int is_rpc; if ( clGetPlatformInfo(platforms[i],999,sizeof(cl_int),&is_rpc,0)==CL_SUCCESS) { printf( "platforms[%d] local=%p remote=%p\n", i,(void*)tmp->local, (void*)tmp->remote); } else { printf( "platforms[%d] not RPC\n",i); } } char buffer[1024]; size_t sz; cl_platform_id rpc_platform = 0; for(i=0;i<nplatforms;i++) { clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME,1023,buffer,&sz); printf( "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer); } int iplat; for(iplat=0;iplat<nplatforms;iplat++) { printf("\n******************\nTEST PLATFORM %d\n*************\n\n",iplat); cl_uint ndevices = 0; cl_device_id* devices = 0; cl_uint ndevices_ret; clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL, ndevices,devices,&ndevices_ret); printf( "after call one i get ndevices_ret = %d\n", ndevices_ret); if (ndevices_ret > 10) exit(-1); ndevices = ndevices_ret; devices = (cl_device_id*)calloc(ndevices,sizeof(cl_device_id)); clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL, ndevices,devices,&ndevices_ret); if (!ndevices_ret) { //printf("no devices, stopping.\n"); //exit(1); printf("no devices, skipping.\n"); continue; } for(i=0;i<ndevices;i++) { clrpc_dptr* tmp = ((_xobj_t*)devices[i])->obj; clGetDeviceInfo(devices[i],CL_DEVICE_NAME,1023,buffer,&sz); printf( "CL_DEVICE_NAME |%s|\n",buffer); cl_platform_id tmpid; clGetDeviceInfo(devices[i],CL_DEVICE_PLATFORM,sizeof(tmpid),&tmpid,&sz); printf("%p\n",platforms[iplat]); fflush(stdout); printf("%p\n",tmpid); fflush(stdout); clGetPlatformInfo(tmpid,CL_PLATFORM_NAME,1023,buffer,&sz); printf( "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer); } cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[iplat], 0 }; printf("i am setting this: prop[%d] %p\n",iplat,platforms[iplat]); cl_context ctx = clCreateContext(ctxprop,ndevices,devices, 0,0,&err); cl_command_queue* cmdq = (cl_command_queue*) calloc(ndevices,sizeof(cl_command_queue)); for(i=0;i<ndevices;i++) { cmdq[i] = clCreateCommandQueue(ctx,devices[i],0,&err); printf( "cmdq %d %p",i,cmdq[i]); } cl_mem a_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem b_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem c_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem d_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); int* a = (int*)malloc(1024*sizeof(int)); int* b = (int*)malloc(1024*sizeof(int)); int* c = (int*)malloc(1024*sizeof(int)); int* d = (int*)malloc(1024*sizeof(int)); char* prgsrc[] = { "__kernel void my_kern( int n, __global int* a, __global int* b )\n" " { int i = get_global_id(0); int tmp = 0; int j; for(j=0;j<n;j++) tmp += a[i] * a[j]; b[i] = tmp; }\n" }; size_t prgsrc_sz = strlen(prgsrc[0]) + 1; cl_program prg = clCreateProgramWithSource(ctx,1, (const char**)prgsrc,&prgsrc_sz,&err); clBuildProgram(prg,ndevices,devices,0,0,0); cl_kernel krn = clCreateKernel(prg,"my_kern",&err); int idev; for(idev=0;idev<ndevices;idev++) { printf("\n******************\nTEST DEVICE %d(%d)\n*************\n\n",idev,iplat); for(i=0;i<size;i++) a[i] = i*10; for(i=0;i<size;i++) b[i] = i*10+1; for(i=0;i<size;i++) c[i] = 0; for(i=0;i<size;i++) d[i] = 0; cl_event ev[8]; for(i=0;i<32;i++) printf("%d/",a[i]); printf("\n"); for(i=0;i<32;i++) printf("%d/",b[i]); printf("\n"); clEnqueueWriteBuffer(cmdq[idev],a_buf,CL_FALSE,0,size*sizeof(int),a, 0,0,&ev[0]); clEnqueueWriteBuffer(cmdq[idev],b_buf,CL_FALSE,0,size*sizeof(int),b, 1,ev,&ev[1]); clEnqueueWriteBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c, 2,ev,&ev[2]); clEnqueueWriteBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d, 3,ev,&ev[3]); size_t offset = 0; size_t gwsz = 128; size_t lwsz = 16; clSetKernelArg(krn,0,sizeof(int),&size); clSetKernelArg(krn,1,sizeof(cl_mem),&a_buf); clSetKernelArg(krn,2,sizeof(cl_mem),&c_buf); clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,4,ev,&ev[4]); clSetKernelArg(krn,1,sizeof(cl_mem),&b_buf); clSetKernelArg(krn,2,sizeof(cl_mem),&d_buf); clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,5,ev,&ev[5]); clEnqueueReadBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c, 6,ev,&ev[6]); clEnqueueReadBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d, 7,ev,&ev[7]); clFlush(cmdq[idev]); clWaitForEvents(8,ev); for(i=0;i<32;i++) printf("%d/",c[i]); printf("\n"); for(i=0;i<32;i++) printf("%d/",d[i]); printf("\n"); for(i=0;i<8;i++) clReleaseEvent(ev[i]); } clReleaseKernel(krn); clReleaseProgram(prg); clReleaseMemObject(a_buf); clReleaseMemObject(b_buf); clReleaseMemObject(c_buf); clReleaseMemObject(d_buf); clReleaseCommandQueue(cmdq[0]); clReleaseContext(ctx); // printf("sleeping ...\n"); // sleep(1); } // clrpc_final(); }
int exec_trig_kernel(const char *program_source, int n, void *srcA, void *dst) { cl_context context; cl_command_queue cmd_queue; cl_device_id *devices; cl_program program; cl_kernel kernel; cl_mem memobjs[2]; size_t global_work_size[1]; size_t local_work_size[1]; size_t cb; cl_int err; float c = 7.3f; // a scalar number to test non-pointer args // create the OpenCL context on a GPU device context = poclu_create_any_context(); if (context == (cl_context)0) return -1; // get the list of GPU devices associated with context clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); devices = (cl_device_id *) malloc(cb); clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); // create a command-queue cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); if (cmd_queue == (cl_command_queue)0) { clReleaseContext(context); free(devices); return -1; } free(devices); // allocate the buffer memory objects memobjs[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float4) * n, srcA, NULL); if (memobjs[0] == (cl_mem)0) { clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } memobjs[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4) * n, NULL, NULL); if (memobjs[1] == (cl_mem)0) { delete_memobjs(memobjs, 1); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the program program = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, NULL); if (program == (cl_program)0) { delete_memobjs(memobjs, 2); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the kernel kernel = clCreateKernel(program, "trig", NULL); if (kernel == (cl_kernel)0) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set the args values err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &memobjs[0]); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &memobjs[1]); err |= clSetKernelArg(kernel, 2, sizeof(float), (void *) &c); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set work-item dimensions global_work_size[0] = n; local_work_size[0]= 2; // execute kernel err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // read output image err = clEnqueueReadBuffer(cmd_queue, memobjs[1], CL_TRUE, 0, n * sizeof(cl_float4), dst, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // release kernel, program, and memory objects delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return 0; // success... }
void maxcoinMiner_openCL_generateOrUpdateKernel() { if( maxcoinOpenCLKernelInited ) return; maxcoinOpenCLKernelInited = true; printf("Compiling OpenCL kernel...\n"); char* kernel_src = (char*)malloc(1024*512); strcpy(kernel_src, ""); cl_int clerr = 0; // init input buffer maxcoinGPU.buffer_blockInputData = malloc(80+8); // endian swapped block data + share target attached at the end memset(maxcoinGPU.buffer_blockInputData, 0x00, 80+8); maxcoinGPU.clBuffer_blockInputData = clCreateBuffer(openCL_getActiveContext(), CL_MEM_READ_WRITE, 88, maxcoinGPU.buffer_blockInputData, &clerr); // init output buffer sint32 outputBlocks = 256; maxcoinGPU.buffer_nonceOutputData = malloc(outputBlocks*4*sizeof(uint32)); memset(maxcoinGPU.buffer_nonceOutputData, 0x00, outputBlocks*4*sizeof(uint32)); maxcoinGPU.clBuffer_nonceOutputData = clCreateBuffer(openCL_getActiveContext(), CL_MEM_READ_WRITE, outputBlocks*4*sizeof(uint32), maxcoinGPU.buffer_nonceOutputData, &clerr); maxcoinMiner_openCL_appendKeccakFunction(kernel_src); strcat(kernel_src, "__kernel void xptMiner_cl_maxcoin_keccak(__global unsigned long *blockData, __global unsigned int *nonceIndexOut)\r\n"); strcat(kernel_src, "{\r\n"); strcat(kernel_src, "unsigned long nonceAndBits = blockData[9] & 0x00000000FFFFFFFF;\r\n" "unsigned long shareTarget = blockData[10];\r\n" "nonceIndexOut[get_local_id(0)] = 0xFFFFFFFF;\r\n" "nonceAndBits += 0x100000000UL*0x1000UL*(unsigned long)get_local_id(0);\r\n" //"nonceAndBits = 0x01f94bdb00000000UL;\r\n" "for(int i=0; i<0x1000; i++) {\r\n" //"for(int i=0; i<1; i++) {\r\n" //"if( keccak256_maxcoin_opt_v(blockData, nonceAndBits) < shareTarget ) nonceIndexOut[0] = nonceAndBits>>32;\r\n" "if( keccak256_maxcoin_opt_v(blockData, nonceAndBits) < 0x0040000000000000UL ) nonceIndexOut[get_local_id(0)] = nonceAndBits>>32;\r\n" "nonceAndBits += 0x100000000UL;\r\n" "}\r\n"); strcat(kernel_src, "}\r\n"); const char* source = kernel_src; size_t src_size = strlen(kernel_src); cl_program program = clCreateProgramWithSource(openCL_getActiveContext(), 1, &source, &src_size, &clerr); if(clerr != CL_SUCCESS) printf("Error creating OpenCL program\n"); // builds the program clerr = clBuildProgram(program, 1, openCL_getActiveDeviceID(), NULL, NULL, NULL); if(clerr != CL_SUCCESS) printf("Error compiling OpenCL program\n"); // shows the log char* build_log; size_t log_size; // First call to know the proper size clGetProgramBuildInfo(program, *openCL_getActiveDeviceID(), CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); build_log = (char*)malloc(log_size+1); memset(build_log, 0x00, log_size+1); // Second call to get the log clGetProgramBuildInfo(program, *openCL_getActiveDeviceID(), CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); build_log[log_size] = '\0'; puts(build_log); free(build_log); maxcoinGPU.kernel_keccak = clCreateKernel(program, "xptMiner_cl_maxcoin_keccak", &clerr); clerr = clSetKernelArg(maxcoinGPU.kernel_keccak, 0, sizeof(cl_mem), &maxcoinGPU.clBuffer_blockInputData); clerr = clSetKernelArg(maxcoinGPU.kernel_keccak, 1, sizeof(cl_mem), &maxcoinGPU.clBuffer_nonceOutputData); free(kernel_src); }
/* Initialize OpenCl processing */ void init_cl() { char *program_buffer, *program_log; size_t program_size, log_size; int err; /* Identify a platform */ err = clGetPlatformIDs(1, &platform, NULL); if(err < 0) { perror("Couldn't identify a platform"); exit(1); } /* Access a device */ err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(err == CL_DEVICE_NOT_FOUND) { err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); } if(err < 0) { perror("Couldn't access any devices"); exit(1); } /* Create OpenCL context properties */ #ifdef MAC CGLContextObj mac_context = CGLGetCurrentContext(); CGLShareGroupObj group = CGLGetShareGroup(mac_context); cl_context_properties properties[] = { CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)group, 0}; #else #ifdef UNIX cl_context_properties properties[] = { CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(), CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(), CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; #else cl_context_properties properties[] = { CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; #endif #endif /* Create context */ context = clCreateContext(properties, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Create program from file */ program_buffer = read_file(PROGRAM_FILE, &program_size); program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer, &program_size, &err); if(err < 0) { perror("Couldn't create the program"); exit(1); } free(program_buffer); /* Build program */ err = clBuildProgram(program, 0, NULL, "-DRADIUS=0.75", NULL, NULL); if(err < 0) { /* Find size of log and print to std output */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size + 1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); printf("%s\n", program_log); free(program_log); exit(1); } /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Create kernel */ kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { printf("Couldn't create a kernel: %d", err); exit(1); }; }
int main(int argc, char **argv) { /* test name */ char name[] = "test_sampler_address_clamp"; size_t global_work_size[1] = { 1 }, local_work_size[1]= { 1 }; size_t srcdir_length, name_length, filename_size; char *filename = NULL; char *source = NULL; cl_device_id devices[1]; cl_context context = NULL; cl_command_queue queue = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_int result; int retval = -1; /* image parameters */ cl_uchar4 *imageData; cl_image_format image_format; cl_image_desc image_desc; printf("Running test %s...\n", name); memset(&image_desc, 0, sizeof(cl_image_desc)); image_desc.image_type = CL_MEM_OBJECT_IMAGE2D; image_desc.image_width = 4; image_desc.image_height = 4; image_format.image_channel_order = CL_RGBA; image_format.image_channel_data_type = CL_UNSIGNED_INT8; imageData = (cl_uchar4*)malloc (4 * 4 * sizeof(cl_uchar4)); if (imageData == NULL) { puts("out of host memory\n"); goto error; } memset (imageData, 1, 4*4*sizeof(cl_uchar4)); /* determine file name of kernel source to load */ srcdir_length = strlen(SRCDIR); name_length = strlen(name); filename_size = srcdir_length + name_length + 16; filename = (char *)malloc(filename_size + 1); if (!filename) { puts("out of memory"); goto error; } snprintf(filename, filename_size, "%s/%s.cl", SRCDIR, name); /* read source code */ source = poclu_read_file (filename); TEST_ASSERT (source != NULL && "Kernel .cl not found."); /* setup an OpenCL context and command queue using default device */ context = poclu_create_any_context(); if (!context) { puts("clCreateContextFromType call failed\n"); goto error; } result = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), devices, NULL); if (result != CL_SUCCESS) { puts("clGetContextInfo call failed\n"); goto error; } queue = clCreateCommandQueue(context, devices[0], 0, NULL); if (!queue) { puts("clCreateCommandQueue call failed\n"); goto error; } /* Create image */ cl_mem image = clCreateImage (context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &image_format, &image_desc, imageData, &result); if (result != CL_SUCCESS) { puts("image creation failed\n"); goto error; } /* create and build program */ program = clCreateProgramWithSource (context, 1, (const char **)&source, NULL, NULL); if (!program) { puts("clCreateProgramWithSource call failed\n"); goto error; } result = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (result != CL_SUCCESS) { puts("clBuildProgram call failed\n"); goto error; } /* execute the kernel with give name */ kernel = clCreateKernel(program, name, NULL); if (!kernel) { puts("clCreateKernel call failed\n"); goto error; } result = clSetKernelArg( kernel, 0, sizeof(cl_mem), &image); if (result) { puts("clSetKernelArg failed\n"); goto error; } result = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (result != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } result = clFinish(queue); if (result == CL_SUCCESS) retval = 0; error: if (image) { clReleaseMemObject (image); } if (kernel) { clReleaseKernel(kernel); } if (program) { clReleaseProgram(program); } if (queue) { clReleaseCommandQueue(queue); } if (context) { clUnloadCompiler (); clReleaseContext (context); } if (source) { free(source); } if (filename) { free(filename); } if (imageData) { free(imageData); } if (retval) { printf("FAIL\n"); return 1; } printf("OK\n"); return 0; }
static ConvolveInfo *GetConvolveInfo(const Image *image,const char *name, const char *source,ExceptionInfo *exception) { char options[MaxTextExtent]; cl_int status; ConvolveInfo *convolve_info; size_t length, lengths[] = { strlen(source) }; /* Create OpenCL info. */ convolve_info=(ConvolveInfo *) AcquireAlignedMemory(1,sizeof(*convolve_info)); if (convolve_info == (ConvolveInfo *) NULL) { (void) ThrowMagickException(exception,GetMagickModule(), ResourceLimitError,"MemoryAllocationFailed","`%s'",image->filename); return((ConvolveInfo *) NULL); } (void) ResetMagickMemory(convolve_info,0,sizeof(*convolve_info)); /* Create OpenCL context. */ convolve_info->context=clCreateContextFromType((cl_context_properties *) NULL,CL_DEVICE_TYPE_GPU,ConvolveNotify,exception,&status); if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS)) convolve_info->context=clCreateContextFromType((cl_context_properties *) NULL,CL_DEVICE_TYPE_CPU,ConvolveNotify,exception,&status); if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS)) convolve_info->context=clCreateContextFromType((cl_context_properties *) NULL,CL_DEVICE_TYPE_DEFAULT,ConvolveNotify,exception,&status); if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS)) { (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning, "failed to create OpenCL context","`%s' (%d)",image->filename,status); DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } /* Detect OpenCL devices. */ status=clGetContextInfo(convolve_info->context,CL_CONTEXT_DEVICES,0,NULL, &length); if ((status != CL_SUCCESS) || (length == 0)) { DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } convolve_info->devices=(cl_device_id *) AcquireMagickMemory(length); if (convolve_info->devices == (cl_device_id *) NULL) { (void) ThrowMagickException(exception,GetMagickModule(), ResourceLimitError,"MemoryAllocationFailed","`%s'",image->filename); DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } status=clGetContextInfo(convolve_info->context,CL_CONTEXT_DEVICES,length, convolve_info->devices,NULL); if (status != CL_SUCCESS) { DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } /* Create OpenCL command queue. */ convolve_info->command_queue=clCreateCommandQueue(convolve_info->context, convolve_info->devices[0],0,&status); if ((convolve_info->command_queue == (cl_command_queue) NULL) || (status != CL_SUCCESS)) { DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } /* Build OpenCL program. */ convolve_info->program=clCreateProgramWithSource(convolve_info->context,1, &source,lengths,&status); if ((convolve_info->program == (cl_program) NULL) || (status != CL_SUCCESS)) { DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } (void) FormatMagickString(options,MaxTextExtent,CLOptions,(double) QuantumRange,MagickEpsilon); status=clBuildProgram(convolve_info->program,1,convolve_info->devices,options, NULL,NULL); if ((convolve_info->program == (cl_program) NULL) || (status != CL_SUCCESS)) { char *log; status=clGetProgramBuildInfo(convolve_info->program, convolve_info->devices[0],CL_PROGRAM_BUILD_LOG,0,NULL,&length); log=(char *) AcquireMagickMemory(length); if (log == (char *) NULL) { DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } status=clGetProgramBuildInfo(convolve_info->program, convolve_info->devices[0],CL_PROGRAM_BUILD_LOG,length,log,&length); (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning, "failed to build OpenCL program","`%s' (%s)",image->filename,log); log=DestroyString(log); DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } /* Get a kernel object. */ convolve_info->kernel=clCreateKernel(convolve_info->program,name,&status); if ((convolve_info->kernel == (cl_kernel) NULL) || (status != CL_SUCCESS)) { DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } return(convolve_info); }
int main() { // START:context cl_platform_id platform; clGetPlatformIDs(1, &platform, NULL); cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); // END:context // START:queue cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); // END:queue // START:kernel char* source = read_source("multiply_arrays.cl"); cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source, NULL, NULL); free(source); clBuildProgram(program, 0, NULL, NULL, NULL, NULL); cl_kernel kernel = clCreateKernel(program, "multiply_arrays", NULL); // END:kernel // START:buffers cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS]; random_fill(a, NUM_ELEMENTS); random_fill(b, NUM_ELEMENTS); cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * NUM_ELEMENTS, a, NULL); cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * NUM_ELEMENTS, b, NULL); cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL); // END:buffers // START:execute clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA); clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB); clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); size_t work_units = NUM_ELEMENTS; clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units, NULL, 0, NULL, NULL); // END:execute // START:results cl_float results[NUM_ELEMENTS]; clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS, results, 0, NULL, NULL); // END:results // START:cleanup clReleaseMemObject(inputA); clReleaseMemObject(inputB); clReleaseMemObject(output); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(context); // END:cleanup for (int i = 0; i < NUM_ELEMENTS; ++i) { printf("%f * %f = %f\n", a[i], b[i], results[i]); } return 0; }
int main(int argc, char *argv[]) { //FILE *fp; cl_platform_id platform_id[2]; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret_code; cl_mem image_in_mem = NULL; cl_mem image_out_mem = NULL; cl_mem twiddle_factors_mem = NULL; cl_float2 *image_in_host; cl_float2 *twiddle_factors_host; cl_kernel kernel_twiddle_factors; cl_kernel kernel_matriz_transpose; cl_kernel kernel_lowpass_filter; pgm_t ipgm; pgm_t opgm; image_file_t *image_filename; char *output_filename; FILE *fp; const char *kernel_filename = C_NOME_ARQ_KERNEL; size_t source_size; char *source_str; cl_int i, j,n ,m; cl_int raio = 0; size_t global_wg[2]; size_t local_wg[2]; float *image_amplitudes; size_t log_size; char *log_file; cl_event kernels_events_out_fft[4]; cl_ulong kernel_runtime = (cl_ulong) 0; cl_ulong kernel_start_time = (cl_ulong) 0; cl_ulong kernel_end_time = (cl_ulong) 0; cl_event write_host_dev_event; cl_ulong write_host_dev_start_time = (cl_ulong) 0; cl_ulong write_host_dev_end_time = (cl_ulong) 0; cl_ulong write_host_dev_run_time = (cl_ulong) 0; cl_event read_dev_host_event; cl_ulong read_dev_host_start_time = (cl_ulong) 0; cl_ulong read_dev_host_end_time = (cl_ulong) 0; cl_ulong read_dev_host_run_time = (cl_ulong) 0; unsigned __int64 image_tam; unsigned __int64 MEGA_BYTES = 1048576; // 1024*1024 double image_tam_MB; double tempo_total; struct event_in_fft_t *fft_events; //=== Timer count start ============================================================================== timer_reset(); timer_start(); //=================================================================================================== if (argc < 2) { printf("**Erro: O arquivo de entrada eh necessario.\n"); exit(EXIT_FAILURE); } image_filename = (image_file_t *) malloc(sizeof(image_file_t)); split_image_filename(image_filename, argv[1]); output_filename = (char *) malloc(40*sizeof(char)); sprintf(output_filename, "%d.%d.%s.%s.%s", image_filename->res, image_filename->num, ENV_TYPE, APP_TYPE, EXTENSAO); fp = fopen(kernel_filename, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(EXIT_FAILURE); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); //=================================================================================================== /* Abrindo imagem do arquivo para objeto de memoria local*/ if( ler_pgm(&ipgm, argv[1]) == -1) exit(EXIT_FAILURE); n = ipgm.width; raio = n/8; m = (cl_int)(log((double)n)/log(2.0)); image_in_host = (cl_float2 *)malloc((n*n)*sizeof(cl_float2)); twiddle_factors_host = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2)); for (i = 0; i < n; i++) { for (j = 0; j < n; j++) { image_in_host[n*i + j].s[0] = (float)ipgm.buf[n*i + j]; image_in_host[n*i + j].s[1] = (float)0; } } fft_events = (struct event_in_fft_t *)malloc(MAX_CALL_FFT*sizeof(struct event_in_fft_t)); kernel_butter_events = (cl_event *)malloc(MAX_CALL_FFT*m*sizeof(cl_event)); //=================================================================================================== CL_CHECK(clGetPlatformIDs(MAX_PLATFORM_ID, platform_id, &ret_num_platforms)); if (ret_num_platforms == 0 ) { fprintf(stderr,"[Erro] Não existem plataformas OpenCL\n"); exit(2); } //=================================================================================================== CL_CHECK(clGetDeviceIDs( platform_id[0], CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices)); //print_platform_info(&platform_id[1]); //=================================================================================================== context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret_code); //=================================================================================================== cmd_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret_code); //=================================================================================================== image_in_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code); image_out_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code); twiddle_factors_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret_code); //=================================================================================================== /* Transfer data to memory buffer */ CL_CHECK(clEnqueueWriteBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &write_host_dev_event)); image_tam = n*n*sizeof(cl_float2); //=================================================================================================== program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret_code); //=================================================================================================== ret_code = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); //=================================================================================================== if (ret_code != CL_SUCCESS) { // Determine the size of the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); //=================================================================================================== // Allocate memory for the log log_file = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log_file, NULL); printf("%s\n", log_file); system("pause"); exit(0); } kernel_twiddle_factors = clCreateKernel(program, "twiddle_factors", &ret_code); kernel_matriz_transpose = clCreateKernel(program, "matrix_trasponse", &ret_code); kernel_lowpass_filter = clCreateKernel(program, "lowpass_filter", &ret_code); /* Processa os fatores Wn*/ //=================================================================================================== CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 0, sizeof(cl_mem), (void *)&twiddle_factors_mem)); CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 1, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n/2, 1); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_twiddle_factors, 1, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[0])); //=================================================================================================== /* Executa a FFT em N/2 */ fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[0]); //=================================================================================================== /* Realiza a transposta da Matriz (imagem) */ CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_in_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[1])); //=================================================================================================== /* Executa a FFT N/2 */ fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[1]); //=================================================================================================== /* Processa o filtro passa baixa */ CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 0, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 1, sizeof(cl_int), (void *)&n)); CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 2, sizeof(cl_int), (void *)&raio)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_lowpass_filter, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[2])); //=================================================================================================== /* Obtem a FFT inversa*/ fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[2]); //=================================================================================================== /* Realiza a transposta da Matriz (imagem) */ CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_in_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[3])); //=================================================================================================== fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[3]); //=================================================================================================== CL_CHECK(clEnqueueReadBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &read_dev_host_event)); //=================================================================================================== //== Total time elapsed ============================================================================ timer_stop(); tempo_total = get_elapsed_time(); //================================================================================================== //====== Get time of Profile Info ================================================================== // Write data time CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &write_host_dev_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &write_host_dev_end_time, NULL)); // Read data time CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &read_dev_host_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &read_dev_host_end_time, NULL)); for (i = 0; i < MAX_CALL_FFT; i++) { kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; if (fft_events[i].kernel_normalize != NULL) { CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); } } for (j=0; j < MAX_CALL_FFT*m; j++){ kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); } write_host_dev_run_time = write_host_dev_end_time - write_host_dev_start_time; read_dev_host_run_time = read_dev_host_end_time - read_dev_host_start_time; /* save_log_debug(write_host_dev_run_time,fp); save_log_debug(read_dev_host_run_time,fp); close_log_debug(fp); */ image_tam_MB = (double) (((double) image_tam)/(double) MEGA_BYTES); //================================================================================================== save_log_gpu(image_filename, kernel_runtime, (double) (image_tam_MB/( (double) read_dev_host_run_time/(double) NANOSECONDS)), (double) (image_tam_MB/ ((double) write_host_dev_run_time/ (double) NANOSECONDS)), tempo_total, LOG_NAME); //=================================================================================================== image_amplitudes = (float*)malloc(n*n*sizeof(float)); for (i=0; i < n; i++) { for (j=0; j < n; j++) { image_amplitudes[n*j + i] = (float) (AMP(((float*)image_in_host)[(2*n*j)+2*i], ((float*)image_in_host)[(2*n*j)+2*i+1])); } } //clFlush(cmd_queue); //clFinish(cmd_queue); opgm.width = n; opgm.height = n; normalizar_pgm(&opgm, image_amplitudes); escrever_pgm(&opgm, output_filename); //=================================================================================================== clFinish(cmd_queue); clReleaseKernel(kernel_twiddle_factors); clReleaseKernel(kernel_matriz_transpose); clReleaseKernel(kernel_lowpass_filter); clReleaseProgram(program); clReleaseMemObject(image_in_mem); clReleaseMemObject(image_out_mem); clReleaseMemObject(twiddle_factors_mem); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); clReleaseEvent(read_dev_host_event); clReleaseEvent(write_host_dev_event); clReleaseEvent(kernels_events_out_fft[0]); clReleaseEvent(kernels_events_out_fft[1]); clReleaseEvent(kernels_events_out_fft[2]); clReleaseEvent(kernels_events_out_fft[3]); destruir_pgm(&ipgm); destruir_pgm(&opgm); free(image_amplitudes); free(source_str); free(image_in_host); free(image_filename); free(twiddle_factors_host); free(output_filename); free(fft_events); free(kernel_butter_events); //_CrtDumpMemoryLeaks(); return 0; }
// Helper function to create and build program and kernel // ********************************************************************* cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2) { // compile cl program size_t program_length; char *source; std::ostringstream preamble; // create the program // with type specification depending on datatype argument switch (datatype) { default: case REDUCE_INT: preamble << "#define T int" << std::endl; break; case REDUCE_FLOAT: preamble << "#define T float" << std::endl; break; } // set blockSize at compile time preamble << "#define blockSize " << blockSize << std::endl; // set isPow2 at compile time preamble << "#define nIsPow2 " << isPowOf2 << std::endl; // Load the source code and prepend the preamble source = oclLoadProgSource(source_path, preamble.str().c_str(), &program_length); oclCheckError(source != NULL, shrTRUE); cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, &program_length, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); free(source); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclReduction.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } // create Kernel std::ostringstream kernelName; kernelName << "reduce" << whichKernel; cl_kernel ckKernel = clCreateKernel(cpProgram, kernelName.str().c_str(), &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); size_t wgSize; ciErrNum = clGetKernelWorkGroupInfo(ckKernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL); if (wgSize == 64) smallBlock = true; else smallBlock = false; // NOTE: the program will get deleted when the kernel is also released clReleaseProgram(cpProgram); return ckKernel; }
/** * @brief Main principal * @param argc El número de argumentos del programa * @param argv Cadenas de argumentos del programa * @return Nada si es correcto o algún número negativo si es incorrecto */ int main( int argc, char** argv ) { if(argc != 2) return -1; // Medimos tiempo para el programa const double start_time = getCurrentTimestamp(); FILE *kernels; char *source_str; size_t source_size, work_items; // OpenCL runtime configuration unsigned num_devices; cl_platform_id platform_ids[3]; cl_uint ret_num_platforms; cl_device_id device_id; cl_context context = NULL; cl_command_queue command_queue; cl_program program = NULL; cl_int ret; cl_kernel kernelINIT; cl_event kernel_event, finish_event; cl_mem objPARTICULAS; // Abrimos el fichero que contiene el kernel fopen_s(&kernels, "initparticulasCPU.cl", "r"); if (!kernels) { fprintf(stderr, "Fallo al cargar el kernel\n"); exit(-1); } source_str = (char *) malloc(0x100000); source_size = fread(source_str, 1, 0x100000, kernels); fclose(kernels); // Obtenemos los IDs de las plataformas disponibles if( clGetPlatformIDs(3, platform_ids, &ret_num_platforms) != CL_SUCCESS) { printf("No se puede obtener id de la plataforma"); return -1; } // Intentamos obtener un dispositivo CPU soportado if( clGetDeviceIDs(platform_ids[1], CL_DEVICE_TYPE_CPU, 1, &device_id, &num_devices) != CL_SUCCESS) { printf("No se puede obtener id del dispositivo"); return -1; } clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &work_items, NULL); // Creación de un contexto OpenCL context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // Creación de una cola de comandos command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); // Creación de un programa kernel desde un fichero de código program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: ¡Fallo al construir el programa ejecutable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s", buffer); exit(-1); } // Creación del kernel OpenCL kernelINIT = clCreateKernel(program, "calc_particles_init", &ret); // Creamos el buffer para las partículas y reservamos espacio ALINEADO para los datos size_t N = atoi(argv[1]); particle *particulas = (particle*) _aligned_malloc(N * sizeof(particle), 64); objPARTICULAS = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * sizeof(particle), NULL, &ret); const size_t global = 4; const size_t local_work_size = 1; // Transferimos el frame al dispositivo cl_event write_event; ret = clEnqueueWriteBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 0, NULL, &write_event); // Establecemos los argumentos del kernel ret = clSetKernelArg(kernelINIT, 0, sizeof(cl_mem), &objPARTICULAS); ret = clSetKernelArg(kernelINIT, 1, sizeof(int), &N); // Ejecutamos el kernel. Un work-item por cada work-group o unidad de cómputo ret = clEnqueueNDRangeKernel(command_queue, kernelINIT, 1, NULL, &global, &local_work_size, 1, &write_event, &kernel_event); // Leemos los resultados ret = clEnqueueReadBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 1, &kernel_event, &finish_event); // Esperamos a que termine de leer los resultados clWaitForEvents(1, &finish_event); // Obtenemos el tiempo del kernel y de las transferencias CPU-RAM cl_ulong totalKernel = getStartEndTime(kernel_event); cl_ulong totalRam = getStartEndTime(write_event) + getStartEndTime(finish_event); const double end_time = getCurrentTimestamp(); // Obtenemos el tiempo consumido por el programa, el kernel y las transferencias de memoria printf("\nTiempo total del programa: %0.3f ms\n", (end_time - start_time) * 1e3); printf("Tiempo total consumido por el kernel: %0.3f ms\n", double(totalKernel) * 1e-6); printf("Tiempo total consumido en transferencias CPU-RAM: %0.3f ms\n", double(totalRam) * 1e-6); // Liberamos todos los recursos usados (kernels y objetos OpenCL) clReleaseEvent(kernel_event); clReleaseEvent(finish_event); clReleaseEvent(write_event); clReleaseMemObject(objPARTICULAS); clReleaseKernel(kernelINIT); clReleaseCommandQueue(command_queue); clReleaseProgram(program); clReleaseContext(context); }
int main() { size_t global_work_size[1] = { 1 }, local_work_size[1]= { 1 }; cl_int err; cl_platform_id platforms[1]; cl_uint nplatforms; cl_device_id devices[1]; // + 1 for duplicate test cl_uint num_devices; cl_program program = NULL; cl_kernel kernelA = NULL; cl_kernel kernelB = NULL; cl_kernel kernelC= NULL; char inputA[] = "A"; char inputB[] = "B"; char inputC[] = "C"; cl_mem inputBufferA = NULL; cl_mem inputBufferB = NULL; cl_mem inputBufferC = NULL; /* command queues */ cl_command_queue queueA = NULL; cl_command_queue queueB = NULL; cl_command_queue queueC = NULL; /* events */ cl_event eventA1 = NULL; cl_event eventB2 = NULL; cl_event eventA3 = NULL; cl_event eventB4 = NULL; /* event wait lists */ cl_event B2_wait_list[1]; cl_event A3_wait_list[1]; cl_event B4_wait_list[1]; cl_event C5_wait_list[2]; err = clGetPlatformIDs(1, platforms, &nplatforms); if (err != CL_SUCCESS && !nplatforms) return EXIT_FAILURE; err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 1, devices, &num_devices); if (err != CL_SUCCESS) return EXIT_FAILURE; cl_context context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &err); if (err != CL_SUCCESS) return EXIT_FAILURE; err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), devices, NULL); if (err != CL_SUCCESS) { puts("clGetContextInfo call failed\n"); goto error; } queueA = clCreateCommandQueue(context, devices[0], 0, NULL); if (!queueA) { puts("clCreateCommandQueue call failed\n"); goto error; } queueB = clCreateCommandQueue(context, devices[0], 0, NULL); if (!queueB) { puts("clCreateCommandQueue call failed\n"); goto error; } queueC = clCreateCommandQueue(context, devices[0], 0, NULL); if (!queueB) { puts("clCreateCommandQueue call failed\n"); goto error; } inputBufferA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, strlen (inputB)+1, (void *) inputA, &err); if (inputBufferA == NULL) { printf("clCreateBuffer call failed err = %d\n", err); goto error; } inputBufferB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, strlen (inputA)+1, (void *) inputB, &err); if (inputBufferB == NULL) { printf("clCreateBuffer call failed err = %d\n", err); goto error; } inputBufferC = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, strlen (inputA)+1, (void *) inputC, &err); if (inputBufferC == NULL) { printf("clCreateBuffer call failed err = %d\n", err); goto error; } size_t kernel_size = strlen (kernelASourceCode); char* kernel_buffer = kernelASourceCode; program = clCreateProgramWithSource (context, 1, (const char**)&kernel_buffer, &kernel_size, &err); if (err != CL_SUCCESS) return EXIT_FAILURE; err = clBuildProgram (program, num_devices, devices, NULL, NULL, NULL); if (err != CL_SUCCESS) return EXIT_FAILURE; kernelA = clCreateKernel (program, "test_kernel", NULL); if (!kernelA) { puts("clCreateKernel call failed\n"); goto error; } kernelB = clCreateKernel (program, "test_kernel", NULL); if (!kernelB) { puts("clCreateKernel call failed\n"); goto error; } kernelC = clCreateKernel (program, "test_kernel", NULL); if (!kernelC) { puts("clCreateKernel call failed\n"); goto error; } err = clSetKernelArg (kernelA, 0, sizeof (cl_mem), &inputBufferA); if (err) { puts("clSetKernelArg failed\n"); goto error; } err = clSetKernelArg (kernelB, 0, sizeof (cl_mem), &inputBufferB); if (err) { puts("clSetKernelArg failed\n"); goto error; } err = clSetKernelArg (kernelC, 0, sizeof (cl_mem), &inputBufferC); if (err) { puts("clSetKernelArg failed\n"); goto error; } /* first enqueue A1*/ err = clEnqueueNDRangeKernel (queueA, kernelA, 1, NULL, global_work_size, local_work_size, 0, NULL, &eventA1); if (err != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } /* enqueue B2 */ B2_wait_list[0] = eventA1; err = clEnqueueNDRangeKernel (queueB, kernelB, 1, NULL, global_work_size, local_work_size, 1, B2_wait_list, &eventB2); if (err != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } /* enqueue A3 */ A3_wait_list[0] = eventB2; err = clEnqueueNDRangeKernel (queueA, kernelA, 1, NULL, global_work_size, local_work_size, 1, A3_wait_list, &eventA3); if (err != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } /* enqueue B4 */ B4_wait_list[0] = eventA3; err = clEnqueueNDRangeKernel (queueB, kernelB, 1, NULL, global_work_size, local_work_size, 1, B4_wait_list, &eventB4); if (err != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } /* enqueue C5 */ C5_wait_list[0] = eventA3; C5_wait_list[1] = eventB4; err = clEnqueueNDRangeKernel (queueC, kernelC, 1, NULL, global_work_size, local_work_size, 2, C5_wait_list, NULL); if (err != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } clFinish(queueC); printf("\n"); return EXIT_SUCCESS; error: return EXIT_FAILURE; }
int main() { int i,j,k; // nb of operations: const int dsize = 512; int nthreads = 1; int nbOfAverages = 1e2; int opsMAC = 2; // operations per MAC cl_short4 *in, *out; cl_half *ck; double tops; //total ops #define NQUEUES 1 cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queues[NQUEUES]; cl_mem bufin, bufck, bufout; cl_event event = NULL; cl_program program; cl_kernel kernel; size_t global[2], local[2]; size_t param[5]; char version[300]; // allocate matrices in = (cl_short4 *) calloc(dsize*dsize, sizeof(*in)); out = (cl_short4 *) calloc(dsize*dsize, sizeof(*out)); ck = (cl_half *) calloc(9*9, sizeof(*ck)); in[0].x = 0x3c00; in[1].x = 0x4000; in[dsize].x = 0x4100; ck[0] = 0x3c00; ck[1] = 0x4000; ck[9] = 0x3000; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL ); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); for(i = 0; i < NQUEUES; i++) queues[i] = clCreateCommandQueue( ctx, device, 0, &err ); // Print some info about the system clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL); printf("CL_DEVICE_VERSION=%s\n", version); clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(version), version, NULL); printf("CL_DRIVER_VERSION=%s\n", version); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS=%d\n", (int)param[0]); j = param[0]; clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(param[0])*j, param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_SIZES="); for(i = 0; i < j; i++) printf("%d ", (int)param[i]); printf("\n"); clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE=%d\n", (int)param[0]); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); if(!program) { printf("Error creating program\n"); return -1; } err = clBuildProgram(program, 0, 0, 0, 0, 0); if(err != CL_SUCCESS) { char buffer[20000]; size_t len; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); puts(buffer); return -1; } kernel = clCreateKernel(program, "conv9x9", &err); if(!kernel || err != CL_SUCCESS) { printf("Error creating kernel\n"); return -1; } /* Prepare OpenCL memory objects and place matrices inside them. */ cl_image_format fmt = {CL_RGBA, CL_HALF_FLOAT}; cl_int rc; bufin = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufout = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufck = clCreateBuffer( ctx, CL_MEM_READ_ONLY, 9 * 9 * sizeof(*ck), NULL, &err ); size_t origin[3] = {0,0,0}; size_t region[3] = {dsize, dsize, 1}; err = clEnqueueWriteImage(queues[0], bufin, CL_TRUE, origin, region, dsize * sizeof(*in), 0, in, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queues[0], bufck, CL_TRUE, 0, 9 * 9 * sizeof( *ck ), ck, 0, NULL, NULL ); clSetKernelArg(kernel, 0, sizeof(int), &dsize); clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufin); clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufck); clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufout); local[0] = 8; local[1] = 8; global[0] = global[1] = dsize-32; usleep(100000); struct timeval start,end; gettimeofday(&start, NULL); for (k=0; k<nthreads; k++) { //printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads()); for(i=0;i<nbOfAverages;i++) { // do the 2D convolution err = clEnqueueNDRangeKernel(queues[0], kernel, 2, NULL, global, local, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("clEnqueueNDRangeKernel error %d\n", err); return -1; } } } clFinish(queues[0]); gettimeofday(&end, NULL); double t = ((double) (end.tv_sec - start.tv_sec)) + ((double) (end.tv_usec - start.tv_usec)) / 1e6; //reports time in [s] - verified! /* Wait for calculations to be finished. */ /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadImage(queues[0], bufout, CL_TRUE, origin, region, dsize * sizeof(*out), 0, out, 0, NULL, NULL ); clFinish(queues[0]); printf("%x %x %x %x\n", out[0].x, out[1].x, out[dsize].x, out[dsize+1].x); /* Release OpenCL memory objects. */ clReleaseMemObject( bufin ); clReleaseMemObject( bufck ); clReleaseMemObject( bufout ); /* Release OpenCL working objects. */ for(i = 0; i < NQUEUES; i++) clReleaseCommandQueue( queues[i] ); clReleaseContext( ctx ); // report performance: tops = 4 * nthreads * opsMAC * (dsize-32)*(dsize-32)*9*9; // total ops printf("Total M ops = %.0lf, # of threads = %d", nbOfAverages*tops*1e-6, nthreads); printf("\nTime in s: %lf:", t); printf("\nTest performance [G OP/s] %lf:", tops*nbOfAverages/t*1e-9); printf("\n"); return(0); }
int MemoryOptimizations::setupCL(void) { cl_int status = 0; size_t deviceListSize; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) { return SDK_FAILURE; } platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } delete[] platforms; } if(NULL == platform) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } // Display available devices. if(!sampleCommon->displayDevices(platform, dType)) { sampleCommon->error("sampleCommon::displayDevices() failed"); return SDK_FAILURE; } /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateContextFromType failed.")) return SDK_FAILURE; /* First, get the size of device list data */ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; int deviceCount = (int)(deviceListSize / sizeof(cl_device_id)); if(!sampleCommon->validateDeviceId(deviceId, deviceCount)) { sampleCommon->error("sampleCommon::validateDeviceId() failed"); return SDK_FAILURE; } /* Now allocate memory for device list based on the size we got earlier */ devices = (cl_device_id*)malloc(deviceListSize); if(devices == NULL) { sampleCommon->error("Failed to allocate memory (devices)."); return SDK_FAILURE; } /* Now, get the device list data */ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetGetContextInfo failed.")) return SDK_FAILURE; /* Get Device specific Information */ /* Get device extensions */ char deviceExtensions[2048]; status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_EXTENSIONS, sizeof(deviceExtensions), deviceExtensions, 0); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo failed.(extensions)")) return SDK_FAILURE; if(!strstr(deviceExtensions, "cl_khr_global_int32_base_atomics")) { sampleCommon->error("Device does not support global_int32_base_atomics!"); return SDK_EXPECTED_FAILURE; } status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&maxWorkGroupSize, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed.")) return SDK_FAILURE; status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void *)&maxDimensions, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed.")) return SDK_FAILURE; maxWorkItemSizes = (size_t*)malloc(maxDimensions*sizeof(size_t)); status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDimensions, (void *)maxWorkItemSizes, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed.")) return SDK_FAILURE; { /* The block is to move the declaration of prop closer to its use */ cl_command_queue_properties prop = 0; prop |= CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue(context, devices[deviceId], prop, &status); if(!sampleCommon->checkVal(status, 0, "clCreateCommandQueue failed.")) return SDK_FAILURE; } /* Input buffer */ inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float4) * length, 0, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateBuffer failed. (inputBuffer)")) return SDK_FAILURE; /* Write data to buffer */ status = clEnqueueWriteBuffer(commandQueue, inputBuffer, 1, 0, sizeof(cl_float4) * length, input, 0, 0, 0); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueWriteBuffer failed. (inputBuffer)")) return SDK_FAILURE; outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * length, 0, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateBuffer failed. (outputBuffer)")) return SDK_FAILURE; /* create a CL program using the kernel source */ streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(!kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * binary = kernelFile.source().c_str(); size_t binarySize = kernelFile.source().size(); program = clCreateProgramWithBinary(context, 1, &devices[deviceId], (const size_t *)&binarySize, (const unsigned char**)&binary, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateProgramWithBinary failed.")) { return SDK_FAILURE; } } else { kernelPath.append("MemoryOptimizations_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file: " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = {strlen(source)}; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; } /* create a cl program executable for all the devices specified */ status = clBuildProgram(program, 1, &devices[deviceId], NULL, NULL, NULL); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char *buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) return SDK_FAILURE; buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sampleCommon->error("Failed to allocate host memory. (buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal( status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; } /* Copy 1D Fast Path */ kernel[0] = clCreateKernel(program, "copy1DFastPath", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.(copy1DFastPath)")) return SDK_FAILURE; /* Copy 1D Complete Path */ kernel[1] = clCreateKernel(program, "copy1DCompletePath", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed. (copy1DCompletePath)")) return SDK_FAILURE; /* Copy 2D float */ kernel[2] = clCreateKernel(program, "copy2Dfloat", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed. (copy2Dfloat)")) return SDK_FAILURE; /* Copy 2D float4 */ kernel[3] = clCreateKernel(program, "copy2Dfloat4", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed. (copy2Dfloat4)")) return SDK_FAILURE; /* Copy 1D float4 */ kernel[4] = clCreateKernel(program, "copy1Dfloat4", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed. (copy1Dfloat4)")) return SDK_FAILURE; /* Copy No Coalesced */ kernel[5] = clCreateKernel(program, "NoCoal", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed. (NoCoal)")) return SDK_FAILURE; /* Copy Split */ kernel[6] = clCreateKernel(program, "Split", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed. (Split)")) return SDK_FAILURE; return SDK_SUCCESS; }
int CommandGenerate::execute(const std::vector<std::string>& p_args) { if(p_args.size() < 10) { help(); return -1; } unsigned int platformId = atol(p_args[1].c_str()); unsigned int deviceId = atol(p_args[2].c_str()); unsigned int staggerSize = atol(p_args[3].c_str()); unsigned int threadsNumber = atol(p_args[4].c_str()); unsigned int hashesNumber = atol(p_args[5].c_str()); unsigned int nonceSize = PLOT_SIZE * staggerSize; std::cerr << "Threads number: " << threadsNumber << std::endl; std::cerr << "Hashes number: " << hashesNumber << std::endl; unsigned int numjobs = (p_args.size() - 5)/4; std::cerr << numjobs << " plot(s) to do." << std::endl; unsigned int staggerMbSize = staggerSize / 4; std::cerr << "Non-GPU memory usage: " << staggerMbSize*numjobs << "MB" << std::endl; std::vector<std::string> paths(numjobs); std::vector<std::ofstream *> out_files(numjobs); std::vector<unsigned long long> addresses(numjobs); std::vector<unsigned long long> startNonces(numjobs); std::vector<unsigned long long> endNonces(numjobs); std::vector<unsigned int> noncesNumbers(numjobs); std::vector<unsigned char*> buffersCpu(numjobs); std::vector<bool> saving_thread_flags(numjobs); std::vector<std::future<void>> save_threads(numjobs); unsigned long long maxNonceNumber = 0; unsigned long long totalNonces = 0; int returnCode = 0; try { for (unsigned int i = 0; i < numjobs; i++) { std::cerr << "----" << std::endl; std::cerr << "Job number " << i << std::endl; unsigned int argstart = 6 + i*4; paths[i] = std::string(p_args[argstart]); addresses[i] = strtoull(p_args[argstart+1].c_str(), NULL, 10); startNonces[i] = strtoull(p_args[argstart+2].c_str(), NULL, 10); noncesNumbers[i] = atol(p_args[argstart+3].c_str()); maxNonceNumber = std::max(maxNonceNumber, (long long unsigned int)noncesNumbers[i]); totalNonces += noncesNumbers[i]; std::ostringstream outFile; outFile << paths[i] << "/" << addresses[i] << "_" << startNonces[i] << "_" << \ noncesNumbers[i] << "_" << staggerSize; std::ios_base::openmode file_mode = std::ios::out | std::ios::binary | std::ios::trunc; out_files[i] = new std::ofstream(outFile.str(), file_mode); assert(out_files[i]); if(noncesNumbers[i] % staggerSize != 0) { noncesNumbers[i] -= noncesNumbers[i] % staggerSize; noncesNumbers[i] += staggerSize; } endNonces[i] = startNonces[i] + noncesNumbers[i]; unsigned int noncesGbSize = noncesNumbers[i] / 4 / 1024; std::cerr << "Path: " << outFile.str() << std::endl; std::cerr << "Nonces: " << startNonces[i] << " to " << endNonces[i] << " (" << noncesGbSize << " GB)" << std::endl; std::cerr << "Creating CPU buffer" << std::endl; buffersCpu[i] = new unsigned char[nonceSize]; if(!buffersCpu[i]) { throw std::runtime_error("Unable to create the CPU buffer (probably out of host memory.)"); } saving_thread_flags[i] = false; std::cerr << "----" << std::endl; } cl_platform_id platforms[4]; cl_uint platformsNumber; cl_device_id devices[32]; cl_uint devicesNumber; cl_context context = 0; cl_command_queue commandQueue = 0; cl_mem bufferGpuGen = 0; cl_mem bufferGpuScoops = 0; cl_program program = 0; cl_kernel kernelStep1 = 0; cl_kernel kernelStep2 = 0; cl_kernel kernelStep3 = 0; int error; std::cerr << "Retrieving OpenCL platforms" << std::endl; error = clGetPlatformIDs(4, platforms, &platformsNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to retrieve the OpenCL platforms"); } if(platformId >= platformsNumber) { throw std::runtime_error("No platform found with the provided id"); } std::cerr << "Retrieving OpenCL GPU devices" << std::endl; error = clGetDeviceIDs(platforms[platformId], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, 32, devices, &devicesNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to retrieve the OpenCL devices"); } if(deviceId >= devicesNumber) { throw std::runtime_error("No device found with the provided id"); } std::cerr << "Creating OpenCL context" << std::endl; context = clCreateContext(0, 1, &devices[deviceId], NULL, NULL, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL context"); } std::cerr << "Creating OpenCL command queue" << std::endl; commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL command queue"); } std::cerr << "Creating OpenCL GPU generation buffer" << std::endl; bufferGpuGen = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * GEN_SIZE * staggerSize, 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL GPU generation buffer"); } std::cerr << "Creating OpenCL GPU scoops buffer" << std::endl; bufferGpuScoops = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * nonceSize, 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL GPU scoops buffer"); } std::cerr << "Creating OpenCL program" << std::endl; std::string source = loadSource("kernel/nonce.cl"); const char* sources[] = {source.c_str()}; size_t sourcesLength[] = {source.length()}; program = clCreateProgramWithSource(context, 1, sources, sourcesLength, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL program"); } std::cerr << "Building OpenCL program" << std::endl; error = clBuildProgram(program, 1, &devices[deviceId], "-I kernel", 0, 0); if(error != CL_SUCCESS) { size_t logSize; clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, 0, 0, &logSize); char* log = new char[logSize]; clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, logSize, (void*)log, 0); std::cerr << log << std::endl; delete[] log; throw OpenclError(error, "Unable to build the OpenCL program"); } std::cerr << "Creating OpenCL step1 kernel" << std::endl; kernelStep1 = clCreateKernel(program, "nonce_step1", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step1 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep1, 2, sizeof(cl_mem), (void*)&bufferGpuGen); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } std::cerr << "Creating OpenCL step2 kernel" << std::endl; kernelStep2 = clCreateKernel(program, "nonce_step2", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step2 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep2, 1, sizeof(cl_mem), (void*)&bufferGpuGen); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } std::cerr << "Creating OpenCL step3 kernel" << std::endl; kernelStep3 = clCreateKernel(program, "nonce_step3", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step3 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep3, 0, sizeof(cl_uint), (void*)&staggerSize); error = clSetKernelArg(kernelStep3, 1, sizeof(cl_mem), (void*)&bufferGpuGen); error = clSetKernelArg(kernelStep3, 2, sizeof(cl_mem), (void*)&bufferGpuScoops); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } size_t globalWorkSize = staggerSize; size_t localWorkSize = (staggerSize < threadsNumber) ? staggerSize : threadsNumber; time_t startTime = time(0); unsigned int totalNoncesCompleted = 0; for (unsigned long long nonce_ordinal = 0; nonce_ordinal < maxNonceNumber; nonce_ordinal += staggerSize) { for (unsigned int jobnum = 0; jobnum < paths.size(); jobnum += 1) { unsigned long long nonce = startNonces[jobnum] + nonce_ordinal; if (nonce > endNonces[jobnum]) { break; } std::cout << "Running with start nonce " << nonce << std::endl; // Is a cl_ulong always an unsigned long long? unsigned int error = 0; error = clSetKernelArg(kernelStep1, 0, sizeof(cl_ulong), (void*)&addresses[jobnum]); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments"); } error = clSetKernelArg(kernelStep1, 1, sizeof(cl_ulong), (void*)&nonce); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments"); } error = clEnqueueNDRangeKernel(commandQueue, kernelStep1, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step1 kernel launch"); } unsigned int hashesSize = hashesNumber * HASH_SIZE; for(int hashesOffset = PLOT_SIZE ; hashesOffset > 0 ; hashesOffset -= hashesSize) { error = clSetKernelArg(kernelStep2, 0, sizeof(cl_ulong), (void*)&nonce); error = clSetKernelArg(kernelStep2, 2, sizeof(cl_uint), (void*)&hashesOffset); error = clSetKernelArg(kernelStep2, 3, sizeof(cl_uint), (void*)&hashesNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step2 kernel arguments"); } error = clEnqueueNDRangeKernel(commandQueue, kernelStep2, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step2 kernel launch"); } error = clFinish(commandQueue); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step2 kernel finish"); } } totalNoncesCompleted += staggerSize; double percent = 100.0 * (double)totalNoncesCompleted / totalNonces; time_t currentTime = time(0); double speed = (double)totalNoncesCompleted / difftime(currentTime, startTime) * 60.0; double estimatedTime = (double)(totalNonces - totalNoncesCompleted) / speed; std::cerr << "\r" << percent << "% (" << totalNoncesCompleted << "/" << totalNonces << " nonces)"; std::cerr << ", " << speed << " nonces/minutes"; std::cerr << ", ETA: " << ((int)estimatedTime / 60) << "h" << ((int)estimatedTime % 60) << "m" << ((int)(estimatedTime * 60.0) % 60) << "s"; std::cerr << "... "; error = clEnqueueNDRangeKernel(commandQueue, kernelStep3, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step3 kernel launch"); } if (saving_thread_flags[jobnum]) { save_threads[jobnum].wait(); // Wait for last job to finish saving_thread_flags[jobnum] = false; } error = clEnqueueReadBuffer(commandQueue, bufferGpuScoops, CL_TRUE, 0, sizeof(cl_uchar) * nonceSize, buffersCpu[jobnum], 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in synchronous read"); } saving_thread_flags[jobnum] = true; save_threads[jobnum] = std::async(std::launch::async, save_nonces, nonceSize, out_files[jobnum], buffersCpu[jobnum]); } } //Clean up for (unsigned int i = 0; i < paths.size(); i += 1) { if (saving_thread_flags[i]) { std::cerr << "waiting for final save to " << paths[i] << " to finish" << std::endl; save_threads[i].wait(); saving_thread_flags[i] = false; std::cerr << "done waiting for final save" << std::endl; if (buffersCpu[i]) { delete[] buffersCpu[i]; } } } if(kernelStep3) { clReleaseKernel(kernelStep3); } if(kernelStep2) { clReleaseKernel(kernelStep2); } if(kernelStep1) { clReleaseKernel(kernelStep1); } if(program) { clReleaseProgram(program); } if(bufferGpuGen) { clReleaseMemObject(bufferGpuGen); } if(bufferGpuScoops) { clReleaseMemObject(bufferGpuScoops); } if(commandQueue) { clReleaseCommandQueue(commandQueue); } if(context) { clReleaseContext(context); } time_t currentTime = time(0); double elapsedTime = difftime(currentTime, startTime) / 60.0; double speed = (double)totalNonces / elapsedTime; std::cerr << "\r100% (" << totalNonces << "/" << totalNonces << " nonces)"; std::cerr << ", " << speed << " nonces/minutes"; std::cerr << ", " << ((int)elapsedTime / 60) << "h" << ((int)elapsedTime % 60) << "m" << ((int)(elapsedTime * 60.0) % 60) << "s"; std::cerr << " " << std::endl; } catch(const OpenclError& ex) { std::cerr << "[ERROR] [" << ex.getCode() << "] " << ex.what() << std::endl; returnCode = -1; } catch(const std::exception& ex) { std::cerr << "[ERROR] " << ex.what() << std::endl; returnCode = -1; } return returnCode; }
int main(int argc, char* argv[]) { struct pb_Parameters *parameters; parameters = pb_ReadParameters(&argc, argv); if (!parameters) return -1; if(!parameters->inpFiles[0]){ fputs("Input file expected\n", stderr); return -1; } struct pb_TimerSet timers; char oclOverhead[] = "OCL Overhead"; char intermediates[] = "IntermediatesKernel"; char finals[] = "FinalKernel"; pb_InitializeTimerSet(&timers); pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL); pb_SwitchToTimer(&timers, pb_TimerID_IO); int numIterations; if (argc >= 2){ numIterations = atoi(argv[1]); } else { fputs("Expected at least one command line argument\n", stderr); return -1; } unsigned int img_width, img_height; unsigned int histo_width, histo_height; FILE* f = fopen(parameters->inpFiles[0],"rb"); int result = 0; result += fread(&img_width, sizeof(unsigned int), 1, f); result += fread(&img_height, sizeof(unsigned int), 1, f); result += fread(&histo_width, sizeof(unsigned int), 1, f); result += fread(&histo_height, sizeof(unsigned int), 1, f); if (result != 4){ fputs("Error reading input and output dimensions from file\n", stderr); return -1; } unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int)); unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char)); result = fread(img, sizeof(unsigned int), img_width*img_height, f); fclose(f); if (result != img_width*img_height){ fputs("Error reading input array from file\n", stderr); return -1; } cl_int ciErrNum; pb_Context* pb_context; pb_context = pb_InitOpenCLContext(parameters); if (pb_context == NULL) { fprintf (stderr, "Error: No OpenCL platform/device can be found."); return -1; } cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId; cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId; cl_context clContext = (cl_context) pb_context->clContext; cl_command_queue clCommandQueue; cl_program clProgram[2]; cl_kernel histo_intermediates_kernel; cl_kernel histo_final_kernel; cl_mem input; cl_mem ranges; cl_mem sm_mappings; cl_mem global_subhisto; cl_mem global_overflow; cl_mem final_histo; clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SetOpenCL(&clContext, &clCommandQueue); pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); cl_uint workItemDimensions; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) ); size_t workItemSizes[workItemDimensions]; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) ); size_t program_length[2]; const char *source_path[2] = { "src/opencl_mxpa/histo_intermediates.cl", "src/opencl_mxpa/histo_final.cl"}; char *source[4]; for (int i = 0; i < 2; ++i) { // Dynamically allocate buffer for source source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]); if(!source[i]) { fprintf(stderr, "Could not load program source\n"); exit(1); } clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum); OCL_ERRCK_VAR(ciErrNum); free(source[i]); } for (int i = 0; i < 2; ++i) { //fprintf(stderr, "Building Program #%d...\n", i); OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, NULL, NULL, NULL) ); /* char *build_log; size_t ret_val_size; ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); OCL_ERRCK_VAR(ciErrNum); build_log = (char *)malloc(ret_val_size+1); ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); OCL_ERRCK_VAR(ciErrNum); // to be carefully, terminate with \0 // there's no information in the reference whether the string is 0 terminated or not build_log[ret_val_size] = '\0'; fprintf(stderr, "%s\n", build_log ); */ } histo_intermediates_kernel = clCreateKernel(clProgram[0], "histo_intermediates_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); histo_final_kernel = clCreateKernel(clProgram[1], "histo_final_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SwitchToTimer(&timers, pb_TimerID_COPY); input = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); ranges = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); sm_mappings = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); final_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); // Must dynamically allocate. Too large for stack unsigned int *zeroData; zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int)); if (zeroData == NULL) { fprintf(stderr, "Failed to allocate %ld bytes of memory on host!\n", sizeof(unsigned int) * img_width * histo_height); exit(1); } for (int y=0; y < img_height; y++){ OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_TRUE, y*img_width*sizeof(unsigned int), // Offset in bytes img_width*sizeof(unsigned int), // Size of data to write &img[y*img_width], // Host Source 0, NULL, NULL) ); } pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); unsigned int img_dim = img_height*img_width; OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &histo_height) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &histo_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(cl_mem), (void *)&final_histo) ); size_t inter_localWS[1] = { workItemSizes[0] }; size_t inter_globalWS[1] = { img_height * inter_localWS[0] }; size_t final_localWS[1] = { workItemSizes[0] }; size_t final_globalWS[1] = {(((int)(histo_height*histo_width+(final_localWS[0]-1))) / (int)final_localWS[0])*(int)final_localWS[0] }; pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); for (int iter = 0; iter < numIterations; iter++) { unsigned int ranges_h[2] = {UINT32_MAX, 0}; // how about something like // __global__ unsigned int ranges[2]; // ...kernel // __shared__ unsigned int s_ranges[2]; // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];} // __syncthreads(); // Although then removing the blocking cudaMemcpy's might cause something about // concurrent kernel execution. // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads? OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 0, // Offset in bytes 2*sizeof(unsigned int), // Size of data to write ranges_h, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 0, // Offset in bytes histo_width*histo_height*sizeof(unsigned int), // Size of data to write zeroData, // Host Source 0, NULL, NULL) ); pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel /*histo_intermediates_kernel*/, 1, 0, inter_globalWS, inter_localWS, 0, 0, 0) ); pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0, final_globalWS, final_localWS, 0, 0, 0) ); } pb_SwitchToTimer(&timers, pb_TimerID_IO); OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 0, // Offset in bytes histo_height*histo_width*sizeof(unsigned char), // Size of data to read histo, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(input) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) ); if (parameters->outFile) { dump_histo_img(histo, histo_height, histo_width, parameters->outFile); } pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); free(zeroData); free(img); free(histo); pb_SwitchToTimer(&timers, pb_TimerID_NONE); printf("\n"); pb_PrintTimerSet(&timers); pb_FreeParameters(parameters); pb_DestroyTimerSet(&timers); OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) ); OCL_ERRCK_RETVAL ( clReleaseContext(clContext) ); return 0; }
int32_t init_kernel_platform() { cl_uint plat_num; cl_platform_id plat_id = NULL; cl_uint dev_num = 0; cl_device_id *devices; ret = clGetPlatformIDs(0, NULL, &plat_num); if (ret < 0) { LOGD("MU1 Error: Getting plat_ids!\n"); return -1; } if(plat_num > 0) { cl_platform_id* plat_ids = (cl_platform_id* )malloc(plat_num* sizeof(cl_platform_id)); ret = clGetPlatformIDs(plat_num, plat_ids, NULL); plat_id = plat_ids[0]; free(plat_ids); } ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, 0, NULL, &dev_num); if (dev_num == 0) { LOGD("MU1: No GPU device available.\n"); LOGD("MU1: Choose CPU as default device.\n"); ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, 0, NULL, &dev_num); devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id)); ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, dev_num, devices, NULL); } else { LOGD("MU1: Choose GPU as default device. dev_num %d\n", dev_num); devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id)); ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, dev_num, devices, NULL); } context = clCreateContext(NULL,1, devices,NULL,NULL,NULL); commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL); char filename[] = "/data/mu1_kernel.cl"; char file_context[10*1024]={0}; const char *source = &file_context[0]; ret = read_cl(filename, &file_context[0]); size_t sourceSize[10] = {strlen(source)}; cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize[0], NULL); ret = clBuildProgram(program, 1, devices, NULL, NULL, NULL); if(ret < 0) { LOGD("MU1 Error: clBuildProgram error\n"); return 0; } kernel = clCreateKernel(program, "process_iq", NULL); inputBuffer_i = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 512*1024*4, (void *)(&table_i[0][0]), NULL); inputBuffer_q = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 512*1024*4, (void *)(&table_q[0][0]), NULL); inputBuffer_o = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, 512*1024*4, (void *)(&table_o[0][0]), NULL); ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer_i); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputBuffer_q); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&inputBuffer_o); if(devices != NULL) { free(devices);} LOGD("MU1: init cl plat success"); return 0; }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'add_sat_ulong8ulong8.cl' */ source_code = read_buffer("add_sat_ulong8ulong8.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "add_sat_ulong8ulong8", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_ulong8 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_ulong8)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_ulong8){{2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong8), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong8), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create and init host side src buffer 1 */ cl_ulong8 *src_1_host_buffer; src_1_host_buffer = malloc(num_elem * sizeof(cl_ulong8)); for (int i = 0; i < num_elem; i++) src_1_host_buffer[i] = (cl_ulong8){{2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 1 */ cl_mem src_1_device_buffer; src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong8), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong8), src_1_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_ulong8 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_ulong8)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_ulong8)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_ulong8), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_ulong8), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_ulong8)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 1 */ free(src_1_host_buffer); /* Free device side src buffer 1 */ ret = clReleaseMemObject(src_1_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
void initOpenCL(void) { /* get the platform(s) */ cl_uint num_platforms; clError = clGetPlatformIDs( 0, NULL, &num_platforms ); checkErr (clError, "clGetPlatformIDs( 0, NULL, &num_platforms );"); if (num_platforms <= 0) { std::cerr << "No platform..." << std::endl; exit(1); } cl_platform_id* platforms = new cl_platform_id[num_platforms]; clError = clGetPlatformIDs(num_platforms, platforms, NULL); checkErr(clError, "clGetPlatformIDs( num_platforms, &platforms, NULL );"); platform_id = platforms[0]; delete platforms; /* Connect to a compute device */ err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 1, &device, NULL); checkErr(err,"Failed to create a device group"); /* Create a compute context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); checkErr(err,"Failed to create a compute context!"); /* Create a command queue */ commandQueue = clCreateCommandQueue(context, device, 0, &err); checkErr(err,"Failed to create a command commands!"); /* Open Kernel file */ std::string filename = "/home/ictp17/bagus/kernels.cl"; std::ifstream kernelFile(filename.c_str(), std::ios::in); if (not kernelFile.is_open()) { std::cout << "Unable to open " << filename << ". " << std::endl; exit(1); } /* * Read the kernel file into an output stream. * Convert this into a char array for passing to OpenCL. */ std::ostringstream outputStringStream; outputStringStream << kernelFile.rdbuf(); std::string srcStdStr = outputStringStream.str(); const char* charSource = srcStdStr.c_str(); kernelFile.close(); /* Create the compute program from the source buffer */ program = clCreateProgramWithSource(context, 1, (const char **) &charSource, NULL, &err); if (not program) { std::cerr << "Error: Failed to create compute program!" << std::endl; exit(1); } /* Build the program executable */ const char* flags = ""; err = clBuildProgram(program, 0, NULL, flags, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[1024]; std::cerr << "Error: Failed to build program executable!" << std::endl; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); std::cerr << buffer << std::endl; exit(1); } /* Create Kernels objects for all the kernels in the OpenCL program */ cl_uint num_kernels; kernel = clCreateKernel(program, "vector_sum", &err); if (err != CL_SUCCESS) { size_t len; char buffer[1024]; std::cout << "Error: Failed to create kernels in program!" << std::endl; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); std::cout << buffer << std::endl; exit(1); } }
short opencl_load_kernel(struct openclenv* env, const char *bitcode_path, const char *kernname, size_t index) { char *err_msg = NULL; int err = 0, log_error; size_t pathlen = strlen(bitcode_path); char *build_log; size_t build_log_size; char build_opts[MAX_BUILD_OPTS_LENGTH]; #ifdef __APPLE__ env->program = clCreateProgramWithSource(env->context, 1, (const char **)&bitcode_path, &pathlen, &err); #else size_t code_length; char *code_contents = _read_file(bitcode_path, &code_length); if (!code_contents) { err_msg = strerror(errno); err = errno; goto error; } env->program = clCreateProgramWithSource(env->context, 1 , (const char **)&code_contents, &code_length, &err); free(code_contents); #endif if (err) { err_msg = "clCreateProgramWithSource"; goto error; } _opencl_generate_build_opts(build_opts, MAX_BUILD_OPTS_LENGTH); stablecl_log(log_message, "Building kernel %s from %s...", kernname, bitcode_path); stablecl_log(log_message, "Build options: %s", build_opts); err = clBuildProgram(env->program, 1, &env->device, build_opts, NULL, NULL); #ifndef SIMULATOR_BUILD log_error = clGetProgramBuildInfo(env->program, env->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size); if (log_error) stablecl_log(log_err, "Error retrieving build log size."); else { build_log = malloc(build_log_size); if (!build_log) stablecl_log(log_err, "Couldn't allocate enough memory for build log."); else { log_error = clGetProgramBuildInfo(env->program, env->device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, &build_log_size); if (log_error) stablecl_log(log_err, "Couldn't get build log: %s", opencl_strerr(log_err)); else if (err) stablecl_log(log_err, "Build log (size %zu):\n%s", build_log_size, build_log); free(build_log); } } #endif if (err) { err_msg = "clBuildProgram"; goto error; } env->kernel[index] = clCreateKernel(env->program, kernname, &err); if (err) { err_msg = "clCreateKernel"; goto error; } _opencl_kernel_info(env, env->kernel[index]); env->enabled_kernels[index] = 1; env->kernel_count++; stablecl_log(log_message, "Kernel %s loaded successfully", kernname); error: if (err && err_msg) stablecl_log(log_err, "Kernel load failed with error %d at %s: %s", err, err_msg, opencl_strerr(err)); return err; }
static ConvolveInfo *GetConvolveInfo(const Image *image,const char *name, const char *source,ExceptionInfo *exception) { char options[MaxTextExtent]; cl_context_properties context_properties[3]; cl_int status; cl_platform_id platforms[1]; cl_uint number_platforms; ConvolveInfo *convolve_info; size_t length, lengths[] = { strlen(source) }; /* Create OpenCL info. */ convolve_info=(ConvolveInfo *) AcquireMagickMemory(sizeof(*convolve_info)); if (convolve_info == (ConvolveInfo *) NULL) { (void) ThrowMagickException(exception,GetMagickModule(), ResourceLimitError,"MemoryAllocationFailed","`%s'",image->filename); return((ConvolveInfo *) NULL); } (void) ResetMagickMemory(convolve_info,0,sizeof(*convolve_info)); /* Create OpenCL context. */ status=clGetPlatformIDs(0,(cl_platform_id *) NULL,&number_platforms); if ((status == CL_SUCCESS) && (number_platforms > 0)) status=clGetPlatformIDs(1,platforms,NULL); if (status != CL_SUCCESS) { (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning, "failed to create OpenCL context","`%s' (%d)",image->filename,status); convolve_info=DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } context_properties[0]=CL_CONTEXT_PLATFORM; context_properties[1]=(cl_context_properties) platforms[0]; context_properties[2]=0; convolve_info->context=clCreateContextFromType(context_properties, (cl_device_type) CL_DEVICE_TYPE_GPU,ConvolveNotify,exception,&status); if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS)) convolve_info->context=clCreateContextFromType(context_properties, (cl_device_type) CL_DEVICE_TYPE_CPU,ConvolveNotify,exception,&status); if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS)) convolve_info->context=clCreateContextFromType(context_properties, (cl_device_type) CL_DEVICE_TYPE_DEFAULT,ConvolveNotify,exception,&status); if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS)) { (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning, "failed to create OpenCL context","`%s' (%d)",image->filename,status); convolve_info=DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } /* Detect OpenCL devices. */ status=clGetContextInfo(convolve_info->context,CL_CONTEXT_DEVICES,0,NULL, &length); if ((status != CL_SUCCESS) || (length == 0)) { convolve_info=DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } convolve_info->devices=(cl_device_id *) AcquireMagickMemory(length); if (convolve_info->devices == (cl_device_id *) NULL) { (void) ThrowMagickException(exception,GetMagickModule(), ResourceLimitError,"MemoryAllocationFailed","`%s'",image->filename); convolve_info=DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } status=clGetContextInfo(convolve_info->context,CL_CONTEXT_DEVICES,length, convolve_info->devices,NULL); if (status != CL_SUCCESS) { convolve_info=DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } if (image->debug != MagickFalse) { char attribute[MaxTextExtent]; size_t length; clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_NAME, sizeof(attribute),attribute,&length); (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Name: %s", attribute); clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_VENDOR, sizeof(attribute),attribute,&length); (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Vendor: %s", attribute); clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_VERSION, sizeof(attribute),attribute,&length); (void) LogMagickEvent(AccelerateEvent,GetMagickModule(), "Driver Version: %s",attribute); clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_PROFILE, sizeof(attribute),attribute,&length); (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Profile: %s", attribute); clGetDeviceInfo(convolve_info->devices[0],CL_DRIVER_VERSION, sizeof(attribute),attribute,&length); (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Driver: %s", attribute); clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_EXTENSIONS, sizeof(attribute),attribute,&length); (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Extensions: %s", attribute); } /* Create OpenCL command queue. */ convolve_info->command_queue=clCreateCommandQueue(convolve_info->context, convolve_info->devices[0],0,&status); if ((convolve_info->command_queue == (cl_command_queue) NULL) || (status != CL_SUCCESS)) { convolve_info=DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } /* Build OpenCL program. */ convolve_info->program=clCreateProgramWithSource(convolve_info->context,1, &source,lengths,&status); if ((convolve_info->program == (cl_program) NULL) || (status != CL_SUCCESS)) { convolve_info=DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } (void) FormatLocaleString(options,MaxTextExtent,CLOptions,(float) QuantumRange,MagickEpsilon); status=clBuildProgram(convolve_info->program,1,convolve_info->devices,options, NULL,NULL); if ((convolve_info->program == (cl_program) NULL) || (status != CL_SUCCESS)) { char *log; status=clGetProgramBuildInfo(convolve_info->program, convolve_info->devices[0],CL_PROGRAM_BUILD_LOG,0,NULL,&length); log=(char *) AcquireMagickMemory(length); if (log == (char *) NULL) { convolve_info=DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } status=clGetProgramBuildInfo(convolve_info->program, convolve_info->devices[0],CL_PROGRAM_BUILD_LOG,length,log,&length); (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning, "failed to build OpenCL program","`%s' (%s)",image->filename,log); log=DestroyString(log); convolve_info=DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } /* Get a kernel object. */ convolve_info->kernel=clCreateKernel(convolve_info->program,name,&status); if ((convolve_info->kernel == (cl_kernel) NULL) || (status != CL_SUCCESS)) { convolve_info=DestroyConvolveInfo(convolve_info); return((ConvolveInfo *) NULL); } return(convolve_info); }