int main(int argc, char** argv) { int err; // error code returned from api calls cl_platform_id platform_id; // platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel size_t global[2]; // global domain size for our calculation size_t local[2]; // local domain size for our calculation char cl_platform_vendor[1001]; char cl_platform_name[1001]; cl_mem in_array; // device memory used for the input array //cl_mem synaptic_weights; // device memory used for the input array cl_mem out_array; // device memory used for the output array if (argc != 2){ printf("%s <inputfile>\n", argv[0]); return -1; } //float in_array[NO_NODES]; //float out_array[NO_NODES]; //float synaptic_weights[NO_NODES*NO_NODES]; float in_array_tb[NO_NODES]; float out_array_tb[NO_NODES]; //float synaptic_weights_tb[NO_NODES*NO_NODES]; float temp =0; int i = 0; int j = 0; int index = 0; FILE* ifp; char* mode = "r"; // // Connect to first platform // err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); return -1; } err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); return -1; } printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor); err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); return -1; } printf("CL_PLATFORM_NAME %s\n",cl_platform_name); // Connect to a compute device // int fpga = 0; #if defined (FPGA_DEVICE) fpga = 1; #endif err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); return -1; } // // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); return -1; } //relu_1(in_array,synaptic_weights,out_array); // Fill our data sets with pattern // //int i = 0; //for(i = 0; i < DATA_SIZE; i++) { // a[i] = (int)i; // b[i] = (int)i; // results[i] = 0; //} // // Create a command commands commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); return -1; } int status; // Create Program Objects // // Load binary from disk unsigned char *kernelbinary; char *xclbin=argv[1]; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin: %s\n", xclbin); printf("Test failed\n"); return -1; } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, &device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); printf("err : %d %s\n",err,err); } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); return -1; } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "relu_1", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); printf("Test failed\n"); return -1; } // Create the input and output arrays in device memory for our calculation // in_array = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * NO_NODES, NULL, NULL); //synaptic_weights = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * NO_NODES * NO_NODES, NULL, NULL); out_array = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * NO_NODES, NULL, NULL); if (!in_array || /*!synaptic_weights ||*/ !out_array) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); return -1; } ifp = fopen("/home/agandhi92/sdaccel/relu_1/input.txt",mode); if(ifp == NULL) { printf("Input file not found \n"); return -1; } while (fscanf(ifp, "%f", &temp) != EOF && index < NO_NODES) { in_array_tb[index++] = temp; } index = 0; temp = 0; //ifp = fopen("/home/agandhi92/sdaccel/relu_1/weight.txt",mode); //if(ifp == NULL) //{ // printf("Weight file not found \n"); // return -1; //} //while (fscanf(ifp, "%f", &temp) != EOF && index < (NO_NODES*NO_NODES)) { // synaptic_weights_tb[index++] = temp; //} // // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, in_array, CL_TRUE, 0, sizeof(float) * NO_NODES, in_array_tb, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array a!\n"); printf("Test failed\n"); return -1; } // Write our data set into the input array in device memory // //err = clEnqueueWriteBuffer(commands, synaptic_weights, CL_TRUE, 0, sizeof(float) * NO_NODES * NO_NODES, synaptic_weights_tb, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array b!\n"); printf("Test failed\n"); return -1; } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &in_array); //err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &synaptic_weights); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_array); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); return -1; } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // err = clEnqueueTask(commands, kernel, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); return -1; } // Read back the results from the device to verify the output // cl_event readevent; err = clEnqueueReadBuffer( commands, out_array, CL_TRUE, 0, sizeof(float) * NO_NODES, out_array_tb, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); return -1; } clWaitForEvents(1, &readevent); //printf("A\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",a[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} //printf("B\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",b[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} //printf("res\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",results[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} // Validate our results // //correct = 0; //for(i = 0; i < DATA_SIZE; i++) //{ // int row = i/MATRIX_RANK; // int col = i%MATRIX_RANK; // int running = 0; // int index; // for (index=0;index<MATRIX_RANK;index++) { // int aIndex = row*MATRIX_RANK + index; // int bIndex = col + index*MATRIX_RANK; // running += a[aIndex] * b[bIndex]; // } // sw_results[i] = running; //} // //for (i = 0;i < DATA_SIZE; i++) // if(results[i] == sw_results[i]) // correct++; //printf("Software\n"); //for (i=0;i<DATA_SIZE;i++) { // //printf("%0.2f ",sw_results[i]); // printf("%d ",sw_results[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} // // //// Print a brief summary detailing the results //// //printf("Computed '%d/%d' correct values!\n", correct, DATA_SIZE); // // Shutdown and cleanup int temp_ = 0; for (j = 0; j < NO_NODES; j++) { if (out_array_tb[j] >= 0) // || out_array_tb[j]== 0) { //printf("out_array[%d] = %f \n", j, out_array[j]); temp_++; } } clReleaseMemObject(in_array); //clReleaseMemObject(synaptic_weights); clReleaseMemObject(out_array); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); if (temp_ == NO_NODES) { printf("*********************************************************** \n"); printf("TEST PASSED !!!!!! The output matches the desired output. \n"); printf("*********************************************************** \n"); return EXIT_SUCCESS; } else { printf("**************************************************************** \n"); printf("TEST Failed !!!!!! The output does not match the desired output. \n"); printf("**************************************************************** \n"); return -1; } //if(correct == DATA_SIZE){ // printf("Test passed!\n"); // return EXIT_SUCCESS; //} //else{ // printf("Test failed\n"); // return -1; //} }
int buildProgramFromAmdBin(unsigned int platform_id,unsigned int dev_id,char *binFile) { int i = 0; cl_int err = CL_SUCCESS; cl_int nPlatforms = 0; cl_platform_id *platforms = NULL; cl_platform_id platform = (cl_platform_id)NULL; cl_context_properties cprops[3]; cl_context context; size_t nDevices = 0; cl_device_id devices[MAXGPUS]; cl_device_id device_id = 0; size_t binary_size = 0; char * binary = NULL; cl_program program = NULL; char pbuf[100]; cl_command_queue cmdq; cl_mem iBuf,oBuf; cl_kernel kernel; cl_int *inBuf,*outBuf; inBuf=(cl_int*)malloc(MAX_THREADS*sizeof(cl_int)); outBuf=(cl_int*)malloc(MAX_THREADS*sizeof(cl_int)); size_t N=MAX_THREADS; cl_event evnt; char buildOptions[200]; char opencl_log[1024*64]; /* figure out the number of platforms on this system. */ err = clGetPlatformIDs( 0, NULL, &nPlatforms ); checkErr( "clGetPlatformIDs", err ); printf( "Number of platforms found: %d\n", nPlatforms ); if( nPlatforms == 0 ) { fprintf( stderr, "Cannot continue without any platforms. Exiting.\n" ); return( -1 ); } platforms = (cl_platform_id *)malloc( sizeof(cl_platform_id) * nPlatforms ); err = clGetPlatformIDs( nPlatforms, platforms, NULL ); checkErr( "clGetPlatformIDs", err ); /* Check for AMD platform. */ err = clGetPlatformInfo( platforms[platform_id], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL ); checkErr( "clGetPlatformInfo", err ); if( strcmp(pbuf, "Advanced Micro Devices, Inc.") == 0 ) { printf( "Found AMD platform\n" ); platform = platforms[platform_id]; } if( platform == (cl_context_properties)0 ) { fprintf( stderr, "Could not find an AMD platform. Exiting.\n" ); exit(0); } clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL,MAXGPUS, devices, &nDevices); cprops[0] = CL_CONTEXT_PLATFORM; cprops[1] = (cl_context_properties)platform; cprops[2] = (cl_context_properties)NULL; context = clCreateContext(cprops, 1, &devices[dev_id], NULL, NULL, &err); checkErr( "clCreateContext", err ); printDeviceName(dev_id,devices[dev_id]); /* read in the binary kernel. */ binary = readKernelBin( &binary_size, binFile ); /* create an OpenCL program from the binary kernel. */ program = clCreateProgramWithBinary( context, 1, &devices[dev_id], &binary_size, (const unsigned char**)&binary, NULL, &err ); checkErr( "clCreateProgramWithBinary", err ); sprintf(buildOptions,"%s %s",OCL_BINARY_OPTIONS ,OCL_OPTIMIZATIONS); /* build the kernel source for all available devices in the context. */ err = clBuildProgram( program, 0, NULL,buildOptions , NULL, NULL ); checkErr("clGetProgramBuildInfo",clGetProgramBuildInfo(program, devices[dev_id], CL_PROGRAM_BUILD_LOG, sizeof(opencl_log), (void *) opencl_log, NULL)); /*Report build errors and warnings*/ if (err != CL_SUCCESS) { fprintf(stderr, "Compilation log: %s\n", opencl_log); exit(0); } #ifdef REPORT_OPENCL_WARNINGS else if (strlen(opencl_log) > 1) fprintf(stderr, "Compilation log: %s\n", opencl_log); #endif /* IT IS APPLICATION-DEPENDENT WHAT TO DO AFTER THIS POINT. */ printf( "\n*** REPLACE THIS WITH ACTUAL WORK ***\n" ); for(i=0; i<MAX_THREADS; i++) inBuf[i]=i; kernel=clCreateKernel(program,"test",&err) ; if(err) { printf("Create Kernel test FAILED\n"); return 0; } cmdq=clCreateCommandQueue(context, devices[dev_id], CL_QUEUE_PROFILING_ENABLE,&err); checkErr("Create CMDQ FAILED\n",err); iBuf=clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,MAX_THREADS*sizeof(cl_int),inBuf,&err); if((iBuf==(cl_mem)0)) { checkErr("Create Buffer FAILED\n",err); } oBuf=clCreateBuffer(context,CL_MEM_WRITE_ONLY,MAX_THREADS*sizeof(cl_int),NULL,&err); if((oBuf==(cl_mem)0)) { checkErr("Create Buffer FAILED\n",err); } checkErr("Set Kernel Arg FAILED arg0\n",clSetKernelArg(kernel,0,sizeof(cl_mem),&iBuf)); checkErr("Set Kernel Arg FAILED arg1\n",clSetKernelArg(kernel,1,sizeof(cl_mem),&oBuf)); err=clEnqueueNDRangeKernel(cmdq,kernel,1,NULL,&N,NULL,0,NULL,&evnt); clWaitForEvents(1,&evnt); checkErr("Write FAILED\n",clEnqueueReadBuffer(cmdq,oBuf,CL_TRUE,0,MAX_THREADS*sizeof(cl_uint),outBuf, 0, NULL, NULL)); for(i=0; i<MAX_THREADS; i++) printf("%d\n",outBuf[i]); return (0); }
int SimpleImage::setupCL() { cl_int status = CL_SUCCESS; #if 0 cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; } size_t deviceListSize; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) { return SDK_FAILURE; } platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } delete[] platforms; } if(NULL == platform) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } // Display available devices. if(!sampleCommon->displayDevices(platform, dType)) { sampleCommon->error("sampleCommon::displayDevices() failed"); return SDK_FAILURE; } /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateContextFromType failed.")) { return SDK_FAILURE; } /* First, get the size of device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; int deviceCount = (int)(deviceListSize / sizeof(cl_device_id)); if(!sampleCommon->validateDeviceId(deviceId, deviceCount)) { sampleCommon->error("sampleCommon::validateDeviceId() failed"); return SDK_FAILURE; } /* Now allocate memory for device list based on the size we got earlier */ devices = (cl_device_id*)malloc(deviceListSize); if(devices == NULL) { sampleCommon->error("Failed to allocate memory (devices)."); return SDK_FAILURE; } /* Now, get the device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; /* Check for image support */ status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &imageSupport, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetDeviceInfo failed.")) return SDK_FAILURE; if(!imageSupport) { std::cout << "Error : Images are not supported on this device!\n"; return SDK_EXPECTED_FAILURE; } /* Create command queue */ cl_command_queue_properties prop = 0; if(timing) prop |= CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue( context, devices[deviceId], prop, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateCommandQueue failed.")) { return SDK_FAILURE; } /* * Create and initialize image objects */ /* Create 2D input image */ inputImage2D = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &imageFormat, width, height, 0, inputImageData, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (inputImageBuffer)")) { return SDK_FAILURE; } /* Create 2D output image */ outputImage2D = clCreateImage2D(context, CL_MEM_WRITE_ONLY, &imageFormat, width, height, 0, 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (inputImageBuffer)")) { return SDK_FAILURE; } /* Create 3D input image */ inputImage3D = clCreateImage3D(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &imageFormat, width, height / 2, 2, //2 slices 0, 0, inputImageData, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (inputImageBuffer)")) { return SDK_FAILURE; } /* Writes to 3D images not allowed in spec currently */ outputImage3D = clCreateImage2D(context, CL_MEM_WRITE_ONLY, &imageFormat, width, height, 0, 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (inputImageBuffer)")) { return SDK_FAILURE; } if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateBuffer failed. (outputImageBuffer)")) { return SDK_FAILURE; } /* create a CL program using the kernel source */ streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(!kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * binary = kernelFile.source().c_str(); size_t binarySize = kernelFile.source().size(); program = clCreateProgramWithBinary(context, 1, &devices[deviceId], (const size_t *)&binarySize, (const unsigned char**)&binary, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateProgramWithBinary failed.")) { return SDK_FAILURE; } } else { kernelPath.append("SimpleImage_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : "<< kernelPath << std::endl; return SDK_FAILURE; } const char *source = kernelFile.source().c_str(); size_t sourceSize[] = {strlen(source)}; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; } /* create a cl program executable for all the devices specified */ status = clBuildProgram( program, 1, &devices[deviceId], NULL, NULL, NULL); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char *buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) return SDK_FAILURE; buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sampleCommon->error("Failed to allocate host memory. (buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal( status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; } /* get a kernel object handle for a kernel with the given name */ kernel2D = clCreateKernel(program, "image2dCopy", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.")) { return SDK_FAILURE; } kernel3D = clCreateKernel(program, "image3dCopy", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.")) { return SDK_FAILURE; } /* Check group size against group size returned by kernel */ status = clGetKernelWorkGroupInfo(kernel2D, devices[deviceId], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernel2DWorkGroupSize, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetKernelWorkGroupInfo failed.")) { return SDK_FAILURE; } /* Check group size against group size returned by kernel */ status = clGetKernelWorkGroupInfo(kernel3D, devices[deviceId], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernel3DWorkGroupSize, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetKernelWorkGroupInfo failed.")) { return SDK_FAILURE; } cl_uint temp = (cl_uint)min(kernel2DWorkGroupSize, kernel3DWorkGroupSize); if((blockSizeX * blockSizeY) > temp) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << blockSizeX * blockSizeY << std::endl; std::cout << "Max Group Size supported on the kernel(s) : " << temp << std::endl; std::cout << "Falling back to " << temp << std::endl; } if(blockSizeX > temp) { blockSizeX = temp; blockSizeY = 1; } } #endif return SDK_SUCCESS; }
_clState *initCl(unsigned int gpu, char *name, size_t nameSize) { _clState *clState = calloc(1, sizeof(_clState)); bool patchbfi = false, prog_built = false; cl_platform_id platform = NULL; char pbuff[256], vbuff[255]; cl_platform_id* platforms; cl_device_id *devices; cl_uint numPlatforms; cl_uint numDevices; cl_int status; status = clGetPlatformIDs(0, NULL, &numPlatforms); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Platforms. (clGetPlatformsIDs)"); return NULL; } platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Platform Ids. (clGetPlatformsIDs)"); return NULL; } if (opt_platform_id >= (int)numPlatforms) { applog(LOG_ERR, "Specified platform that does not exist"); return NULL; } status = clGetPlatformInfo(platforms[opt_platform_id], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Platform Info. (clGetPlatformInfo)"); return NULL; } platform = platforms[opt_platform_id]; if (platform == NULL) { perror("NULL platform found!\n"); return NULL; } applog(LOG_INFO, "CL Platform vendor: %s", pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform name: %s", pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(vbuff), vbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform version: %s", vbuff); status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Device IDs (num)"); return NULL; } if (numDevices > 0 ) { devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); /* Now, get the device list data */ status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Device IDs (list)"); return NULL; } applog(LOG_INFO, "List of devices:"); unsigned int i; for (i = 0; i < numDevices; i++) { status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Device Info"); return NULL; } applog(LOG_INFO, "\t%i\t%s", i, pbuff); } if (gpu < numDevices) { status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Device Info"); return NULL; } applog(LOG_INFO, "Selected %i: %s", gpu, pbuff); strncpy(name, pbuff, nameSize); } else { applog(LOG_ERR, "Invalid GPU %i", gpu); return NULL; } } else return NULL; cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Creating Context. (clCreateContextFromType)"); return NULL; } /* Check for BFI INT support. Hopefully people don't mix devices with * and without it! */ char * extensions = malloc(1024); const char * camo = "cl_amd_media_ops"; char *find; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS"); return NULL; } find = strstr(extensions, camo); if (find) clState->hasBitAlign = true; /* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */ char * devoclver = malloc(1024); const char * ocl10 = "OpenCL 1.0"; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_VERSION, 1024, (void *)devoclver, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_VERSION"); return NULL; } find = strstr(devoclver, ocl10); if (!find) clState->hasOpenCL11plus = true; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&clState->preferred_vwidth, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT"); return NULL; } applog(LOG_DEBUG, "Preferred vector width reported %d", clState->preferred_vwidth); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE"); return NULL; } applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size); /* For some reason 2 vectors is still better even if the card says * otherwise, and many cards lie about their max so use 256 as max * unless explicitly set on the command line. 79x0 cards perform * better without vectors */ if (clState->preferred_vwidth > 1) { if (strstr(name, "Tahiti")) clState->preferred_vwidth = 1; else clState->preferred_vwidth = 2; } if (opt_vectors) clState->preferred_vwidth = opt_vectors; if (opt_worksize && opt_worksize <= (int)clState->max_work_size) clState->work_size = opt_worksize; else clState->work_size = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->preferred_vwidth; /* Create binary filename based on parameters passed to opencl * compiler to ensure we only load a binary that matches what would * have otherwise created. The filename is: * name + kernelname +/i bitalign + v + vectors + w + work_size + sizeof(long) + .bin */ char binaryfilename[255]; char filename[255]; char numbuf[10]; if (chosen_kernel == KL_NONE) { if (strstr(name, "Tahiti") // GCN || !clState->hasBitAlign // Older Radeon & Nvidia || strstr(vbuff, "844.4") // Linux 64 bit ATI 2.6 SDK || strstr(vbuff, "851.4") // Windows 64 bit "" || strstr(vbuff, "831.4") // Windows & Linux 32 bit "" ) clState->chosen_kernel = KL_POCLBM; else clState->chosen_kernel = KL_PHATK; } else clState->chosen_kernel = chosen_kernel; switch (clState->chosen_kernel) { case KL_POCLBM: strcpy(filename, POCLBM_KERNNAME".cl"); strcpy(binaryfilename, POCLBM_KERNNAME); break; case KL_NONE: /* Shouldn't happen */ case KL_PHATK: strcpy(filename, PHATK_KERNNAME".cl"); strcpy(binaryfilename, PHATK_KERNNAME); break; case KL_DIAKGCN: strcpy(filename, DIAKGCN_KERNNAME".cl"); strcpy(binaryfilename, DIAKGCN_KERNNAME); break; case KL_DIABLO: strcpy(filename, DIABLO_KERNNAME".cl"); strcpy(binaryfilename, DIABLO_KERNNAME); break; } FILE *binaryfile; size_t *binary_sizes; char **binaries; int pl; char *source = file_contents(filename, &pl); size_t sourceSize[] = {(size_t)pl}; cl_uint slot, cpnd; slot = cpnd = 0; if (!source) return NULL; binary_sizes = calloc(sizeof(size_t) * MAX_GPUDEVICES * 4, 1); if (unlikely(!binary_sizes)) { applog(LOG_ERR, "Unable to calloc binary_sizes"); return NULL; } binaries = calloc(sizeof(char *) * MAX_GPUDEVICES * 4, 1); if (unlikely(!binaries)) { applog(LOG_ERR, "Unable to calloc binaries"); return NULL; } strcat(binaryfilename, name); if (clState->hasBitAlign) strcat(binaryfilename, "bitalign"); strcat(binaryfilename, "v"); sprintf(numbuf, "%d", clState->preferred_vwidth); strcat(binaryfilename, numbuf); strcat(binaryfilename, "w"); sprintf(numbuf, "%d", (int)clState->work_size); strcat(binaryfilename, numbuf); strcat(binaryfilename, "long"); sprintf(numbuf, "%d", (int)sizeof(long)); strcat(binaryfilename, numbuf); strcat(binaryfilename, ".bin"); loadbin: binaryfile = fopen(binaryfilename, "rb"); if (!binaryfile) { applog(LOG_DEBUG, "No binary found, generating from source"); } else { struct stat binary_stat; if (unlikely(stat(binaryfilename, &binary_stat))) { applog(LOG_DEBUG, "Unable to stat binary, generating from source"); fclose(binaryfile); goto build; } if (!binary_stat.st_size) goto build; binary_sizes[slot] = binary_stat.st_size; binaries[slot] = (char *)calloc(binary_sizes[slot], 1); if (unlikely(!binaries[slot])) { applog(LOG_ERR, "Unable to calloc binaries"); fclose(binaryfile); return NULL; } if (fread(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot]) { applog(LOG_ERR, "Unable to fread binaries"); fclose(binaryfile); free(binaries[slot]); goto build; } clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)"); fclose(binaryfile); free(binaries[slot]); goto build; } clRetainProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)"); return NULL; } fclose(binaryfile); applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename); goto built; } ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// build: /* If no binary is available, and we have a card that suffers with phatk * on SDK2.6, use the poclbm kernel instead if one has not been * selected. */ if (clState->chosen_kernel != KL_POCLBM && chosen_kernel == KL_NONE && !strstr(name, "Tahiti") && clState->hasBitAlign && (strstr(vbuff, "844.4") /* Linux 64 bit ATI 2.6 SDK */ || strstr(vbuff, "851.4") /* Windows 64 bit "" */ || strstr(vbuff, "831.4") /* Windows & Linux 32 bit "" */ )) { applog(LOG_WARNING, "SDK 2.6 detected, using poclbm kernel"); clState->chosen_kernel = KL_POCLBM; strcpy(filename, POCLBM_KERNNAME".cl"); strcpy(binaryfilename, POCLBM_KERNNAME); strcat(binaryfilename, name); strcat(binaryfilename, "bitalign"); strcat(binaryfilename, "v"); sprintf(numbuf, "%d", clState->preferred_vwidth); strcat(binaryfilename, numbuf); strcat(binaryfilename, "w"); sprintf(numbuf, "%d", (int)clState->work_size); strcat(binaryfilename, numbuf); strcat(binaryfilename, "long"); sprintf(numbuf, "%d", (int)sizeof(long)); strcat(binaryfilename, numbuf); strcat(binaryfilename, ".bin"); goto loadbin; } clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithSource)"); return NULL; } clRetainProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)"); return NULL; } /* create a cl program executable for all the devices specified */ char *CompilerOptions = calloc(1, 256); sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d", (int)clState->work_size, clState->preferred_vwidth); applog(LOG_DEBUG, "Setting worksize to %d", clState->work_size); if (clState->preferred_vwidth > 1) applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->preferred_vwidth); if (clState->hasBitAlign) { strcat(CompilerOptions, " -D BITALIGN"); applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN"); if (strstr(name, "Cedar") || strstr(name, "Redwood") || strstr(name, "Juniper") || strstr(name, "Cypress" ) || strstr(name, "Hemlock" ) || strstr(name, "Caicos" ) || strstr(name, "Turks" ) || strstr(name, "Barts" ) || strstr(name, "Cayman" ) || strstr(name, "Antilles" ) || strstr(name, "Wrestler" ) || strstr(name, "Zacate" ) || strstr(name, "WinterPark" ) || strstr(name, "BeaverCreek" )) patchbfi = true; } else applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN"); if (patchbfi) { strcat(CompilerOptions, " -D BFI_INT"); applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT"); } else applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch"); applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions); status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL); free(CompilerOptions); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Building Program (clBuildProgram)"); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_INFO, "%s", log); return NULL; } prog_built = true; status = clGetProgramInfo(clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)"); return NULL; } status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)"); return NULL; } /* The actual compiled binary ends up in a RANDOM slot! Grr, so we have * to iterate over all the binary slots and find where the real program * is. What the heck is this!? */ for (slot = 0; slot < cpnd; slot++) if (binary_sizes[slot]) break; /* copy over all of the generated binaries. */ applog(LOG_DEBUG, "Binary size for gpu %d found in binary slot %d: %d", gpu, slot, binary_sizes[slot]); if (!binary_sizes[slot]) { applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!"); return NULL; } binaries[slot] = calloc(sizeof(char) * binary_sizes[slot], 1); status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL ); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)"); return NULL; } /* Patch the kernel if the hardware supports BFI_INT but it needs to * be hacked in */ if (patchbfi) { unsigned remaining = binary_sizes[slot]; char *w = binaries[slot]; unsigned int start, length; /* Find 2nd incidence of .text, and copy the program's * position and length at a fixed offset from that. Then go * back and find the 2nd incidence of \x7ELF (rewind by one * from ELF) and then patch the opcocdes */ if (!advance(&w, &remaining, ".text")) goto build; w++; remaining--; if (!advance(&w, &remaining, ".text")) { /* 32 bit builds only one ELF */ w--; remaining++; } memcpy(&start, w + 285, 4); memcpy(&length, w + 289, 4); w = binaries[slot]; remaining = binary_sizes[slot]; if (!advance(&w, &remaining, "ELF")) goto build; w++; remaining--; if (!advance(&w, &remaining, "ELF")) { /* 32 bit builds only one ELF */ w--; remaining++; } w--; remaining++; w += start; remaining -= start; applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching", w, remaining); patch_opcodes(w, length); status = clReleaseProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Releasing program. (clReleaseProgram)"); return NULL; } clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)"); return NULL; } clRetainProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)"); return NULL; } /* Program needs to be rebuilt */ prog_built = false; } free(source); /* Save the binary to be loaded next time */ binaryfile = fopen(binaryfilename, "wb"); if (!binaryfile) { /* Not a fatal problem, just means we build it again next time */ applog(LOG_DEBUG, "Unable to create file %s", binaryfilename); } else { if (unlikely(fwrite(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot])) { applog(LOG_ERR, "Unable to fwrite to binaryfile"); return NULL; } fclose(binaryfile); } built: if (binaries[slot]) free(binaries[slot]); free(binaries); free(binary_sizes); applog(LOG_INFO, "Initialising kernel %s with%s bitalign, %d vectors and worksize %d", filename, clState->hasBitAlign ? "" : "out", clState->preferred_vwidth, clState->work_size); if (!prog_built) { /* create a cl program executable for all the devices specified */ status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Building Program (clBuildProgram)"); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_INFO, "%s", log); return NULL; } clRetainProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)"); return NULL; } } /* get a kernel object handle for a kernel with the given name */ clState->kernel = clCreateKernel(clState->program, "search", &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Creating Kernel from program. (clCreateKernel)"); return NULL; } ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status); if (status != CL_SUCCESS) /* Try again without OOE enable */ clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Creating Command Queue. (clCreateCommandQueue)"); return NULL; } clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: clCreateBuffer (outputBuffer)"); return NULL; } return clState; }
void run_delaunay(cl_device_id device, bool is_cpu) { curr_device = device; /* Open Input Points */ scll points = open_points("inputpoints.bin"); int num_points = points->count; /* Create Context */ int error_code; context = clCreateContext(NULL, 1, &device, NULL, NULL, &error_code); if(error_code != 0) { printf("clCreateContext error code = %d\n", error_code); goto ExitFunction; } /* Create Command Queue */ cl_command_queue_properties properties = 0; command_queue = clCreateCommandQueue(context, device, properties, &error_code); if(error_code != 0) { printf("clCreateCommandQueue error code ret=%d\n", error_code); goto ReleaseContext; } char * program_name; if(is_cpu) program_name = "cpu_kernel"; else program_name = "cell_kernel"; /* Open Program */ size_t binary_length; unsigned char * binary; OpenProgramBinary(program_name, &binary_length, &binary); cl_program program; cl_int binary_status; program = clCreateProgramWithBinary(context, 1, &device, &binary_length, (const unsigned char **) &binary, &binary_status, &error_code); if(error_code != 0) { printf("clCreateProgramWithBinary error code = %d\n", error_code); goto ReleaseCommandQueue; } char * kernel_name; if(is_cpu) kernel_name = "InCircle"; else kernel_name = "cell_function"; /* Open Kernel */ kernel = clCreateKernel(program, kernel_name, &error_code); if(error_code != 0) { printf("clCreateKernel error code = %d\n", error_code); goto ReleaseProgram; } /* Create buffers */ points_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, points->alloc_size, NULL, &error_code); if(error_code != 0) { printf("clCreateBuffer (points_mem) error code = %d\n", error_code); goto ReleaseKernel; } scll triangles = delaunay_core(points, num_points, is_cpu); //save_triangles("triangles.txt", triangles); clReleaseMemObject(points_mem); ReleaseKernel: clReleaseKernel(kernel); ReleaseProgram: clReleaseProgram(program); ReleaseCommandQueue: clReleaseCommandQueue(command_queue); ReleaseContext: clReleaseContext(context); ExitFunction: return; }
cl_program piglit_cl_build_program_with_binary_extended(piglit_cl_context context, size_t* lenghts, unsigned char** binaries, const char* options, bool fail) { cl_int errNo; cl_program program; cl_int* binary_status = malloc(sizeof(cl_int) * context->num_devices); program = clCreateProgramWithBinary(context->cl_ctx, context->num_devices, context->device_ids, lenghts, (const unsigned char**)binaries, binary_status, &errNo); if(errNo != CL_SUCCESS) { int i; fprintf(stderr, "Could not create program with binary: %s\n", piglit_cl_get_error_name(errNo)); printf("Create error with binaries:\n"); for(i = 0; i < context->num_devices; i++) { char* device_name = piglit_cl_get_device_info(context->device_ids[i], CL_DEVICE_NAME); printf("Error for %s: %s\n", device_name, piglit_cl_get_error_name(binary_status[i])); free(device_name); } free(binary_status); return NULL; } free(binary_status); errNo = clBuildProgram(program, context->num_devices, context->device_ids, options, NULL, NULL); if( (!fail && errNo != CL_SUCCESS) || ( fail && errNo == CL_SUCCESS)) { int i; fprintf(stderr, !fail ? "Could not build program: %s\n" : "Program built when it should have failed: %s\n", piglit_cl_get_error_name(errNo)); printf("Build log for binaries.\n"); for(i = 0; i < context->num_devices; i++) { char* device_name = piglit_cl_get_device_info(context->device_ids[i], CL_DEVICE_NAME); char* log = piglit_cl_get_program_build_info(program, context->device_ids[i], CL_PROGRAM_BUILD_LOG); printf("Build log for device %s:\n -------- \n%s\n -------- \n", device_name, log); free(device_name); free(log); } clReleaseProgram(program); return NULL; } return program; }
int starpu_opencl_load_binary_opencl(const char *kernel_id, struct starpu_opencl_program *opencl_programs) { unsigned int dev; unsigned int nb_devices; nb_devices = _starpu_opencl_get_device_count(); // Iterate over each device for(dev = 0; dev < nb_devices; dev ++) { cl_device_id device; cl_context context; cl_program program; cl_int err; char *binary; char binary_file_name[1024]; size_t length; cl_int binary_status; opencl_programs->programs[dev] = NULL; starpu_opencl_get_device(dev, &device); starpu_opencl_get_context(dev, &context); if (context == NULL) { _STARPU_DEBUG("[%u] is not a valid OpenCL context\n", dev); continue; } // Load the binary buffer err = _starpu_opencl_get_binary_name(binary_file_name, 1024, kernel_id, dev, device); if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err); binary = _starpu_opencl_load_program_binary(binary_file_name, &length); // Create the compute program from the binary buffer program = clCreateProgramWithBinary(context, 1, &device, &length, (const unsigned char **) &binary, &binary_status, &err); if (!program || err != CL_SUCCESS) { _STARPU_DISP("Error: Failed to load program binary!\n"); return EXIT_FAILURE; } // Build the program executable err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); // Get the status { cl_build_status status; size_t len; static char buffer[4096] = ""; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); if (len > 2) _STARPU_DISP("Compilation output\n%s\n", buffer); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL); if (err != CL_SUCCESS || status != CL_BUILD_SUCCESS) { _STARPU_DISP("Error: Failed to build program executable!\n"); _STARPU_DISP("clBuildProgram: %d - clGetProgramBuildInfo: %d\n", err, status); return EXIT_FAILURE; } } // Store program opencl_programs->programs[dev] = program; } return 0; }
int NBody::setupCL() { cl_int status = CL_SUCCESS; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) { return SDK_FAILURE; } platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } delete[] platforms; } if(NULL == platform) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } // Display available devices. if(!sampleCommon->displayDevices(platform, dType)) { sampleCommon->error("sampleCommon::displayDevices() failed"); return SDK_FAILURE; } /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateContextFromType failed.")) { return SDK_FAILURE; } size_t deviceListSize; /* First, get the size of device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; int deviceCount = (int)(deviceListSize / sizeof(cl_device_id)); if(!sampleCommon->validateDeviceId(deviceId, deviceCount)) { sampleCommon->error("sampleCommon::validateDeviceId() failed"); return SDK_FAILURE; } /* Now allocate memory for device list based on the size we got earlier */ devices = (cl_device_id*)malloc(deviceListSize); if(devices == NULL) { sampleCommon->error("Failed to allocate memory (devices)."); return SDK_FAILURE; } /* Now, get the device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; /* Create command queue */ commandQueue = clCreateCommandQueue( context, devices[deviceId], 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateCommandQueue failed.")) { return SDK_FAILURE; } /* Get Device specific Information */ status = clGetDeviceInfo( devices[deviceId], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void*)&maxWorkGroupSize, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed.")) return SDK_FAILURE; status = clGetDeviceInfo( devices[deviceId], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void*)&maxDimensions, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed.")) return SDK_FAILURE; maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t)); status = clGetDeviceInfo( devices[deviceId], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * maxDimensions, (void*)maxWorkItemSizes, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed.")) return SDK_FAILURE; status = clGetDeviceInfo( devices[deviceId], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), (void *)&totalLocalMemory, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_LOCAL_MEM_SIZE failed.")) return SDK_FAILURE; /* * Create and initialize memory objects */ /* Create memory objects for position */ currPos = clCreateBuffer( context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (oldPos)")) { return SDK_FAILURE; } /* Initialize position buffer */ status = clEnqueueWriteBuffer(commandQueue, currPos, 1, 0, numBodies * sizeof(cl_float4), pos, 0, 0, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueWriteBuffer failed. (oldPos)")) { return SDK_FAILURE; } /* Create memory objects for position */ newPos = clCreateBuffer( context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (newPos)")) { return SDK_FAILURE; } /* Create memory objects for velocity */ currVel = clCreateBuffer( context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (oldVel)")) { return SDK_FAILURE; } /* Initialize velocity buffer */ status = clEnqueueWriteBuffer(commandQueue, currVel, 1, 0, numBodies * sizeof(cl_float4), vel, 0, 0, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueWriteBuffer failed. (oldVel)")) { return SDK_FAILURE; } /* Create memory objects for velocity */ newVel = clCreateBuffer( context, CL_MEM_READ_ONLY, numBodies * sizeof(cl_float4), 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (newVel)")) { return SDK_FAILURE; } /* create a CL program using the kernel source */ streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(!kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * binary = kernelFile.source().c_str(); size_t binarySize = kernelFile.source().size(); program = clCreateProgramWithBinary(context, 1, &devices[deviceId], (const size_t *)&binarySize, (const unsigned char**)&binary, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateProgramWithBinary failed.")) { return SDK_FAILURE; } } else { kernelPath.append("NBody_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; } /* create a cl program executable for all the devices specified */ status = clBuildProgram( program, 1, &devices[deviceId], NULL, NULL, NULL); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) return SDK_FAILURE; buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sampleCommon->error("Failed to allocate host memory. (buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal( status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; } /* get a kernel object handle for a kernel with the given name */ kernel = clCreateKernel( program, "nbody_sim", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.")) { return SDK_FAILURE; } return SDK_SUCCESS; }
int BoxFilterGLSeparable::setupCL() { cl_int status = CL_SUCCESS; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; } size_t deviceListSize; /* * Have a look at the available platforms and pick either * the AMD one if available or the system default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if (CL_SUCCESS != status) { fputs("clGetPlatformIDs() failed", stderr); exit(-1); } if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (CL_SUCCESS != status) { fputs("clGetPlatformIDs() failed", stderr); exit(-1); } for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { platform = platforms[i]; break; } } delete platforms; } if(NULL == platform) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } // Display available devices. if(!sampleCommon->displayDevices(platform, dType)) { sampleCommon->error("sampleCommon::displayDevices() failed"); return SDK_FAILURE; } /* * If we could find our platform, use it. Otherwise use just available platform. */ #ifdef _WIN32 HGLRC glCtx = wglGetCurrentContext(); #else //!_WIN32 GLXContext glCtx = glXGetCurrentContext(); #endif //!_WIN32 cl_context_properties cpsGL[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, #ifdef _WIN32 CL_WGL_HDC_KHR, (intptr_t)wglGetCurrentDC(), #else //!_WIN32 CL_GLX_DISPLAY_KHR, (intptr_t)glXGetCurrentDisplay(), #endif //!_WIN32 CL_GL_CONTEXT_KHR, (intptr_t)glCtx, 0 }; context = clCreateContextFromType(cpsGL, dType, NULL, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateContextFromType failed.")) { return SDK_FAILURE; } /* First, get the size of device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; int deviceCount = (int)(deviceListSize / sizeof(cl_device_id)); if(!sampleCommon->validateDeviceId(deviceId, deviceCount)) { sampleCommon->error("sampleCommon::validateDeviceId() failed"); return SDK_FAILURE; } /* Now allocate memory for device list based on the size we got earlier */ devices = (cl_device_id*)malloc(deviceListSize); if(devices == NULL) { sampleCommon->error("Failed to allocate memory (devices)."); return SDK_FAILURE; } /* Now, get the device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; /* Create command queue */ cl_command_queue_properties prop = 0; if(timing) prop |= CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue( context, devices[deviceId], prop, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateCommandQueue failed.")) { return SDK_FAILURE; } /* * Create texture object */ glGenTextures(1, &tex); glBindTexture(GL_TEXTURE_2D, tex); /* Set parameters */ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindTexture(GL_TEXTURE_2D, 0); /* * Create pixel-buffer object */ glGenBuffers(1, &pbo); glBindBuffer(GL_ARRAY_BUFFER, pbo); // initialize buffer object unsigned int size = width * height * sizeof(cl_uchar4); // buffer data glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW); glBindBuffer(GL_ARRAY_BUFFER, 0); /* Create OpenCL buffer from GL PBO */ outputImageBuffer = clCreateFromGLBuffer(context, CL_MEM_WRITE_ONLY, pbo, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateFromGLBuffer failed. (outputImageBuffer)")) return SDK_FAILURE; /* * Create and initialize memory objects */ /* Create memory object for input Image */ inputImageBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width * height * pixelSize, inputImageData, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (inputImageBuffer)")) { return SDK_FAILURE; } /* Create memory object for temp Image */ tempImageBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE, width * height * pixelSize, 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (tempImageBuffer)")) { return SDK_FAILURE; } /* create a CL program using the kernel source */ streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(!kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * binary = kernelFile.source().c_str(); size_t binarySize = kernelFile.source().size(); program = clCreateProgramWithBinary(context, 1, &devices[deviceId], (const size_t *)&binarySize, (const unsigned char**)&binary, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateProgramWithBinary failed.")) { return SDK_FAILURE; } } else { kernelPath.append("BoxFilterGL_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char *source = kernelFile.source().c_str(); size_t sourceSize[] = {strlen(source)}; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; } /* create a cl program executable for all the devices specified */ status = clBuildProgram( program, 1, &devices[deviceId], NULL, NULL, NULL); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char *buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) return SDK_FAILURE; buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sampleCommon->error("Failed to allocate host memory.(buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal( status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; } /* get a kernel object handle for a kernel with the given name */ verticalKernel = clCreateKernel(program, "box_filter_vertical", &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateKernel failed. (vertical)")) { return SDK_FAILURE; } #ifdef USE_LDS horizontalKernel = clCreateKernel(program, "box_filter_horizontal_local", &status); #else horizontalKernel = clCreateKernel(program, "box_filter_horizontal", &status); #endif if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateKernel failed. (horizontal)")) { return SDK_FAILURE; } /* Check group size against group size returned by kernel */ status = clGetKernelWorkGroupInfo(verticalKernel, devices[deviceId], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetKernelWorkGroupInfo failed.")) { return SDK_FAILURE; } if((blockSizeX * blockSizeY) > kernelWorkGroupSize) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << blockSizeX * blockSizeY << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelWorkGroupSize << std::endl; std::cout << "Falling back to " << kernelWorkGroupSize << std::endl; } /* Three possible cases */ if(blockSizeX > kernelWorkGroupSize) { blockSizeX = kernelWorkGroupSize; blockSizeY = 1; } } return SDK_SUCCESS; }
struct cl_package initFPGA( const char* xclbin, const char* kernel_name ) { /*****************************************/ /* Initialize OpenCL */ /*****************************************/ // Retrieve the number of platforms cl_uint numPlatforms = 0; cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); //printf("Found %d platforms support OpenCL, return code %d.\n", numPlatforms, status); // Allocate enough space for each platform cl_platform_id *platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) printf("clGetPlatformIDs error(%d)\n", status); // Retrieve the number of devices cl_uint numDevices = 0; #ifndef FPGA_DEVICE status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); #else status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &numDevices); #endif printf("Found %d devices support OpenCL.\n", numDevices); // Allocate enough space for each device cl_device_id *devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id)); // Fill in the devices #ifndef FPGA_DEVICE status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); #else status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, numDevices, devices, NULL); #endif if (status != CL_SUCCESS) printf("clGetDeviceIDs error(%d)\n", status); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); if (status != CL_SUCCESS) printf("clCreateContext error(%d)\n", status); //Create a command-queue cl_command_queue clCommandQue = clCreateCommandQueue(context, devices[0], 0, &status); if (status != CL_SUCCESS) printf("clCreateCommandQueue error(%d)\n", status); // 6. Load and build OpenCL kernel #ifndef FPGA_DEVICE // Create a program with source code cl_program program = clCreateProgramWithSource(context, 1, (const char**)&logistic_cl, NULL, &status); if (status != 0) printf("clCreateProgramWithSource error(%d)\n", status); // Build (compile) the program for the device status = clBuildProgram(program, 1, devices, NULL, NULL, NULL); #else // Load binary from disk unsigned char *kernelbinary; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("ERROR: failed to load kernel from xclbin: %s\n", xclbin); exit(1); } size_t n_bit = n_i; // Create the compute program from offline cl_program program = clCreateProgramWithBinary(context, 1, &devices[0], &n_bit, (const unsigned char **) &kernelbinary, NULL, &status); if ((!program) || (status != CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", status); exit(1); } // Build the program executable status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); #endif if (status != 0) { char errmsg[2048]; size_t sizemsg = 0; status = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 2048*sizeof(char), errmsg, &sizemsg); printf("clBuildProgram error(%d)\n", status); printf("Compilation messages: \n %s", errmsg); } cl_kernel clKernel = clCreateKernel(program, kernel_name, &status); if (status != CL_SUCCESS) printf("clCreateKernel error(%d)\n", status); // TODO: parameterize the size of buffers cl_mem d_gradient = clCreateBuffer(context, CL_MEM_READ_WRITE, FEATURE_SIZE*LABEL_SIZE*GROUP_SIZE*sizeof(float), NULL, &status); if (status != CL_SUCCESS) printf("d_gradient clCreateBuffer error(%d)\n", status); cl_mem d_weights = clCreateBuffer(context, CL_MEM_READ_ONLY, FEATURE_SIZE*LABEL_SIZE*sizeof(float), NULL, &status); if (status != CL_SUCCESS) printf("d_weights clCreateBuffer error(%d)\n", status); cl_mem d_data = clCreateBuffer(context, CL_MEM_READ_ONLY, (FEATURE_SIZE+LABEL_SIZE)*CHUNK_SIZE*sizeof(float), NULL, &status); if (status != CL_SUCCESS) printf("d_data clCreateBuffer error(%d)\n", status); struct cl_package result; result.context = context; result.kernel = clKernel; result.commandQueue = clCommandQue; result.d_gradient = d_gradient; result.d_weights = d_weights; result.d_data = d_data; return result; }
pclu_program* pclu_create_program(pclu_context* pclu, const char* path) { int errcode; pclu_program* pgm = (pclu_program*) malloc(sizeof(pclu_program)); pgm->pclu = pclu; pgm->build_log = 0; #define LOAD_BINS 1 #if LOAD_BINS const char* binary = (const char*) pclu_slurp_file("fmma.ptx"); size_t size = strlen(binary); const unsigned char** bins = (const unsigned char**) binary; int status; pgm->program = clCreateProgramWithBinary(pclu->context, 1, &(pclu->device), &size, bins, &status, &errcode); pclu_check_call("clCreateProgramWithBinary", errcode); pclu_check_call("clCreateProgramWithBinary status", status); #else /* Read the source from disk */ char* source = pclu_slurp_file(path); size_t size = strlen(source); const char** sources = (const char**) &source; pgm->program = clCreateProgramWithSource(pclu->context, 1, sources, &size, &errcode); pclu_check_call("clCreateProgramWithSource", errcode); free(source); /* Compile for the device */ errcode = clBuildProgram(pgm->program, 1, &(pclu->device), "", 0, 0); /* Print out errors on failure */ if (errcode == CL_BUILD_PROGRAM_FAILURE) { size_t log_size; char* log_text; pclu_check_call("clGetProgramBuildInfo", clGetProgramBuildInfo( pgm->program, pclu->device, CL_PROGRAM_BUILD_LOG, 0, 0, &log_size)); log_text = (char*) alloca(log_size); pclu_check_call("clGetProgramBuildInfo", clGetProgramBuildInfo( pgm->program, pclu->device, CL_PROGRAM_BUILD_LOG, log_size, log_text, 0)); fprintf(stderr, "Build Errors\n%s\n", log_text); } pclu_check_call("clBuildProgram", errcode); #endif #if DUMP_BINS /* Dump the Binaries */ size_t bin_size; errcode = clGetProgramInfo(pgm->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bin_size, 0); pclu_check_call("clGetProgramInfo(BIN_SIZE)", errcode); cl_uchar* binary = (cl_uchar*) malloc(bin_size); errcode = clGetProgramInfo(pgm->program, CL_PROGRAM_BINARIES, bin_size, &binary, 0); pclu_check_call("clGetProgramInfo(BINARIES)", errcode); FILE* bf = fopen("opencl.bin", "w"); fwrite((void*)binary, bin_size, 1, bf); fclose(bf); free(binary); #endif /* Get the kernels */ /* pclu_check_call("clCreateKernelsInProgram", clCreateKernelsInProgram(pgm->program, 0, 0, &(pgm->num_kernels))); pgm->kernels = (cl_kernel*) malloc(pgm->num_kernels*sizeof(cl_kernel)); pclu_check_call("clCreateKernelsInProgram", clCreateKernelsInProgram(pgm->program, pgm->num_kernels, pgm->kernels, 0)); */ return pgm; }
_clState *initCl(unsigned int gpu, char *name, size_t nameSize) { int patchbfi = 0; cl_int status = 0; unsigned int i; _clState *clState = calloc(1, sizeof(_clState)); cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Platforms. (clGetPlatformsIDs)"); return NULL; } if (numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Platform Ids. (clGetPlatformsIDs)"); return NULL; } for(i = 0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Platform Info. (clGetPlatformInfo)"); free(platforms); return NULL; } platform = platforms[i]; if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } free(platforms); } if (platform == NULL) { perror("NULL platform found!\n"); return NULL; } size_t nDevices; cl_uint numDevices; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Device IDs (num)"); return NULL; } cl_device_id *devices; if (numDevices > 0 ) { devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); /* Now, get the device list data */ status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Device IDs (list)"); return NULL; } applog(LOG_INFO, "List of devices:"); unsigned int i; for(i=0; i<numDevices; i++) { char pbuff[100]; status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Device Info"); return NULL; } applog(LOG_INFO, "\t%i\t%s", i, pbuff); } if (gpu < numDevices) { char pbuff[100]; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, &nDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Device Info"); return NULL; } applog(LOG_INFO, "Selected %i: %s", gpu, pbuff); strncpy(name, pbuff, nameSize); } else { applog(LOG_ERR, "Invalid GPU %i", gpu); return NULL; } } else return NULL; cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Creating Context. (clCreateContextFromType)"); return NULL; } /* Check for BFI INT support. Hopefully people don't mix devices with * and without it! */ char * extensions = malloc(1024); const char * camo = "cl_amd_media_ops"; char *find; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS"); return NULL; } find = strstr(extensions, camo); if (find) clState->hasBitAlign = patchbfi = 1; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&clState->preferred_vwidth, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT"); return NULL; } if (opt_debug) applog(LOG_DEBUG, "Preferred vector width reported %d", clState->preferred_vwidth); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE"); return NULL; } if (opt_debug) applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size); /* For some reason 2 vectors is still better even if the card says * otherwise, and many cards lie about their max so use 256 as max * unless explicitly set on the command line */ if (clState->preferred_vwidth > 1) clState->preferred_vwidth = 2; if (opt_vectors) clState->preferred_vwidth = opt_vectors; if (opt_worksize && opt_worksize <= clState->max_work_size) clState->work_size = opt_worksize; else clState->work_size = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->preferred_vwidth; /* Create binary filename based on parameters passed to opencl * compiler to ensure we only load a binary that matches what would * have otherwise created. The filename is: * name + kernelname +/i bitalign + v + vectors + w + work_size + sizeof(long) + .bin */ char binaryfilename[255]; char numbuf[10]; char filename[16]; if (chosen_kernel == KL_NONE) { if (clState->hasBitAlign) chosen_kernel = KL_PHATK; else chosen_kernel = KL_POCLBM; } switch (chosen_kernel) { case KL_POCLBM: strcpy(filename, "poclbm110817.cl"); strcpy(binaryfilename, "poclbm110817"); break; case KL_NONE: /* Shouldn't happen */ case KL_PHATK: strcpy(filename, "phatk110817.cl"); strcpy(binaryfilename, "phatk110817"); break; } FILE *binaryfile; size_t *binary_sizes; char **binaries; int pl; char *source, *rawsource = file_contents(filename, &pl); size_t sourceSize[] = {(size_t)pl}; if (!rawsource) return NULL; source = malloc(pl); if (!source) { applog(LOG_ERR, "Unable to malloc source"); return NULL; } binary_sizes = (size_t *)malloc(sizeof(size_t)*nDevices); if (unlikely(!binary_sizes)) { applog(LOG_ERR, "Unable to malloc binary_sizes"); return NULL; } binaries = (char **)malloc(sizeof(char *)*nDevices); if (unlikely(!binaries)) { applog(LOG_ERR, "Unable to malloc binaries"); return NULL; } strcat(binaryfilename, name); if (clState->hasBitAlign) strcat(binaryfilename, "bitalign"); strcat(binaryfilename, "v"); sprintf(numbuf, "%d", clState->preferred_vwidth); strcat(binaryfilename, numbuf); strcat(binaryfilename, "w"); sprintf(numbuf, "%d", (int)clState->work_size); strcat(binaryfilename, numbuf); strcat(binaryfilename, "long"); sprintf(numbuf, "%d", (int)sizeof(long)); strcat(binaryfilename, numbuf); strcat(binaryfilename, ".bin"); binaryfile = fopen(binaryfilename, "rb"); if (!binaryfile) { if (opt_debug) applog(LOG_DEBUG, "No binary found, generating from source"); } else { struct stat binary_stat; if (unlikely(stat(binaryfilename, &binary_stat))) { if (opt_debug) applog(LOG_DEBUG, "Unable to stat binary, generating from source"); fclose(binaryfile); goto build; } binary_sizes[gpu] = binary_stat.st_size; binaries[gpu] = (char *)malloc(binary_sizes[gpu]); if (unlikely(!binaries[gpu])) { applog(LOG_ERR, "Unable to malloc binaries"); fclose(binaryfile); return NULL; } if (fread(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu]) { applog(LOG_ERR, "Unable to fread binaries[gpu]"); fclose(binaryfile); goto build; } fclose(binaryfile); clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[gpu], (const unsigned char **)&binaries[gpu], &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)"); return NULL; } if (opt_debug) applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename); free(binaries[gpu]); goto built; } ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// build: memcpy(source, rawsource, pl); /* Patch the source file with the preferred_vwidth */ if (clState->preferred_vwidth > 1) { char *find = strstr(source, "VECTORSX"); if (unlikely(!find)) { applog(LOG_ERR, "Unable to find VECTORSX in source"); return NULL; } find += 7; // "VECTORS" if (clState->preferred_vwidth == 2) strncpy(find, "2", 1); else strncpy(find, "4", 1); if (opt_debug) applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->preferred_vwidth); } /* Patch the source file defining BITALIGN */ if (clState->hasBitAlign) { char *find = strstr(source, "BITALIGNX"); if (unlikely(!find)) { applog(LOG_ERR, "Unable to find BITALIGNX in source"); return NULL; } find += 8; // "BITALIGN" strncpy(find, " ", 1); if (opt_debug) applog(LOG_DEBUG, "cl_amd_media_ops found, patched source with BITALIGN"); } else if (opt_debug) applog(LOG_DEBUG, "cl_amd_media_ops not found, will not BITALIGN patch"); if (patchbfi) { char *find = strstr(source, "BFI_INTX"); if (unlikely(!find)) { applog(LOG_ERR, "Unable to find BFI_INTX in source"); return NULL; } find += 7; // "BFI_INT" strncpy(find, " ", 1); if (opt_debug) applog(LOG_DEBUG, "cl_amd_media_ops found, patched source with BFI_INT"); } else if (opt_debug) applog(LOG_DEBUG, "cl_amd_media_ops not found, will not BFI_INT patch"); clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithSource)"); return NULL; } clRetainProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)"); return NULL; } /* create a cl program executable for all the devices specified */ char CompilerOptions[256]; sprintf(CompilerOptions, "%s%i", "-DWORKSIZE=", (int)clState->work_size); //int n = 1000; //while(n--) // printf("%s", CompilerOptions); //return 1; status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Building Program (clBuildProgram)"); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_INFO, "%s", log); return NULL; } status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nDevices, binary_sizes, NULL ); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetPlatformInfo)"); return NULL; } /* copy over all of the generated binaries. */ if (opt_debug) applog(LOG_DEBUG, "binary size %d : %d", gpu, binary_sizes[gpu]); if (!binary_sizes[gpu]) { applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, may need to reboot!"); return NULL; } binaries[gpu] = (char *)malloc( sizeof(char)*binary_sizes[gpu]); status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL ); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: Getting program info. (clGetPlatformInfo)"); return NULL; } /* Patch the kernel if the hardware supports BFI_INT */ if (patchbfi) { unsigned remaining = binary_sizes[gpu]; char *w = binaries[gpu]; unsigned int start, length; /* Find 2nd incidence of .text, and copy the program's * position and length at a fixed offset from that. Then go * back and find the 2nd incidence of \x7ELF (rewind by one * from ELF) and then patch the opcocdes */ if (!advance(&w, &remaining, ".text")) {patchbfi = 0; goto build;} w++; remaining--; if (!advance(&w, &remaining, ".text")) { /* 32 bit builds only one ELF */ w--; remaining++; } memcpy(&start, w + 285, 4); memcpy(&length, w + 289, 4); w = binaries[gpu]; remaining = binary_sizes[gpu]; if (!advance(&w, &remaining, "ELF")) {patchbfi = 0; goto build;} w++; remaining--; if (!advance(&w, &remaining, "ELF")) { /* 32 bit builds only one ELF */ w--; remaining++; } w--; remaining++; w += start; remaining -= start; if (opt_debug) applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching", w, remaining); patch_opcodes(w, length); status = clReleaseProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Releasing program. (clReleaseProgram)"); return NULL; } clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[gpu], (const unsigned char **)&binaries[gpu], &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)"); return NULL; } clRetainProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)"); return NULL; } } free(source); free(rawsource); /* Save the binary to be loaded next time */ binaryfile = fopen(binaryfilename, "wb"); if (!binaryfile) { /* Not a fatal problem, just means we build it again next time */ if (opt_debug) applog(LOG_DEBUG, "Unable to create file %s", binaryfilename); } else { if (unlikely(fwrite(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu])) { applog(LOG_ERR, "Unable to fwrite to binaryfile"); return NULL; } fclose(binaryfile); } if (binaries[gpu]) free(binaries[gpu]); built: free(binaries); free(binary_sizes); applog(LOG_INFO, "Initialising kernel %s with%s BFI_INT patching, %d vectors and worksize %d", filename, patchbfi ? "" : "out", clState->preferred_vwidth, clState->work_size); /* create a cl program executable for all the devices specified */ status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Building Program (clBuildProgram)"); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_INFO, "%s", log); return NULL; } /* get a kernel object handle for a kernel with the given name */ clState->kernel = clCreateKernel(clState->program, "search", &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Creating Kernel from program. (clCreateKernel)"); return NULL; } ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status); if (status != CL_SUCCESS) /* Try again without OOE enable */ clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Creating Command Queue. (clCreateCommandQueue)"); return NULL; } clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, BUFFERSIZE, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: clCreateBuffer (outputBuffer)"); return NULL; } return clState; }
int main(int argc, char** argv) { srand(1000); int i; unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*) malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*) malloc(mem_size_B); randomInit(h_A, size_A); randomInit(h_B, size_B); unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*) malloc(mem_size_C); cl_context clGPUContext; cl_command_queue clCommandQue; cl_program clProgram; cl_kernel clKernel; cl_event mm; size_t dataBytes; size_t kernelLength; cl_int errcode; cl_mem d_A; cl_mem d_B; cl_mem d_C; clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode); d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, NULL, &errcode); d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &errcode); d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &errcode); FILE* fp = fopen("hw2.cl", "r"); fseek (fp , 0 , SEEK_END); const size_t lSize = ftell(fp); rewind(fp); unsigned char* buffer; buffer = (unsigned char*) malloc (lSize); fread(buffer, 1, lSize, fp); fclose(fp); cl_int status; clProgram = clCreateProgramWithBinary(clGPUContext, 1, (const cl_device_id *)clDevices, &lSize, (const unsigned char**)&buffer, &status, &errcode); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); clKernel = clCreateKernel(clProgram, "MM", &errcode); size_t globalWorkSize[2]; int wA = WA; int wC = WC; errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C); errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA); errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC); globalWorkSize[0] = 16; globalWorkSize[1] = 16; cl_ulong time_start, time_end, total_time = 0; errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, globalWorkSize, NULL, 0, NULL, &mm); printf("Average time = %lu\n"); clFinish(clCommandQue); clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time += time_end - time_start; printf("Average time = %lu\n", total_time); errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); free(clDevices); clReleaseContext(clGPUContext); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseCommandQueue(clCommandQue); }
WEAK void halide_init_kernels(void *user_context, const char* src, int size) { int err; cl_device_id dev; // Initialize one shared context for all Halide compiled instances if (!(*cl_ctx)) { const cl_uint maxPlatforms = 4; cl_platform_id platforms[maxPlatforms]; cl_uint platformCount = 0; err = clGetPlatformIDs( maxPlatforms, platforms, &platformCount ); CHECK_ERR( err, "clGetPlatformIDs" ); cl_platform_id platform = NULL; // Find the requested platform, or the first if none specified. const char * name = getenv("HL_OCL_PLATFORM"); if (name != NULL) { for (cl_uint i = 0; i < platformCount; ++i) { const cl_uint maxPlatformName = 256; char platformName[maxPlatformName]; err = clGetPlatformInfo( platforms[i], CL_PLATFORM_NAME, maxPlatformName, platformName, NULL ); if (err != CL_SUCCESS) continue; if (strstr(platformName, name)) { platform = platforms[i]; break; } } } else if (platformCount > 0) { platform = platforms[0]; } if (platform == NULL){ halide_printf(user_context, "Failed to find OpenCL platform\n"); return; } #ifdef DEBUG const cl_uint maxPlatformName = 256; char platformName[maxPlatformName]; err = clGetPlatformInfo( platform, CL_PLATFORM_NAME, maxPlatformName, platformName, NULL ); CHECK_ERR( err, "clGetPlatformInfo" ); halide_printf(user_context, "Got platform '%s', about to create context (t=%lld)\n", platformName, (long long)halide_current_time_ns(user_context)); #endif cl_device_type device_type = 0; // Find the device types requested. const char * dev_type = getenv("HL_OCL_DEVICE"); if (dev_type != NULL) { if (strstr("cpu", dev_type)) device_type |= CL_DEVICE_TYPE_CPU; if (strstr("gpu", dev_type)) device_type |= CL_DEVICE_TYPE_GPU; } // If no devices are specified yet, just use all. if (device_type == 0) device_type = CL_DEVICE_TYPE_ALL; // Make sure we have a device const cl_uint maxDevices = 4; cl_device_id devices[maxDevices]; cl_uint deviceCount = 0; err = clGetDeviceIDs( platform, device_type, maxDevices, devices, &deviceCount ); CHECK_ERR( err, "clGetDeviceIDs" ); if (deviceCount == 0) { halide_printf(user_context, "Failed to get device\n"); return; } dev = devices[deviceCount-1]; #ifdef DEBUG const cl_uint maxDeviceName = 256; char deviceName[maxDeviceName]; err = clGetDeviceInfo( dev, CL_DEVICE_NAME, maxDeviceName, deviceName, NULL ); CHECK_ERR( err, "clGetDeviceInfo" ); halide_printf(user_context, "Got device '%s', about to create context (t=%lld)\n", deviceName, (long long)halide_current_time_ns(user_context)); #endif // Create context cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; *cl_ctx = clCreateContext(properties, 1, &dev, NULL, NULL, &err); CHECK_ERR( err, "clCreateContext" ); // cuEventCreate(&__start, 0); // cuEventCreate(&__end, 0); halide_assert(user_context, !(*cl_q)); *cl_q = clCreateCommandQueue(*cl_ctx, dev, 0, &err); CHECK_ERR( err, "clCreateCommandQueue" ); } else { #ifdef DEBUG halide_printf(user_context, "Already had context %p\n", *cl_ctx); #endif // Maintain ref count of context. CHECK_CALL( clRetainContext(*cl_ctx), "clRetainContext" ); CHECK_CALL( clRetainCommandQueue(*cl_q), "clRetainCommandQueue" ); CHECK_CALL( clGetContextInfo(*cl_ctx, CL_CONTEXT_DEVICES, sizeof(dev), &dev, NULL), "clGetContextInfo" ); } // Initialize a module for just this Halide module if ((!__mod) && (size > 1)) { // Create module cl_device_id devices[] = { dev }; size_t lengths[] = { size }; if (strstr(src, "/*OpenCL C*/")) { // Program is OpenCL C. #ifdef DEBUG halide_printf(user_context, "Compiling OpenCL C kernel: %s\n\n", src); #endif const char * sources[] = { src }; __mod = clCreateProgramWithSource(*cl_ctx, 1, &sources[0], NULL, &err ); CHECK_ERR( err, "clCreateProgramWithSource" ); } else { // Program is SPIR binary. #ifdef DEBUG halide_printf(user_context, "Compiling SPIR kernel (%i bytes)\n", size); #endif const unsigned char * binaries[] = { (unsigned char *)src }; __mod = clCreateProgramWithBinary(*cl_ctx, 1, devices, lengths, &binaries[0], NULL, &err ); CHECK_ERR( err, "clCreateProgramWithBinary" ); } err = clBuildProgram( __mod, 1, &dev, NULL, NULL, NULL ); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; halide_printf(user_context, "Error: Failed to build program executable! err = %d\n", err); if (clGetProgramBuildInfo(__mod, dev, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len) == CL_SUCCESS) halide_printf(user_context, "Build Log:\n %s\n-----\n", buffer); else halide_printf(user_context, "clGetProgramBuildInfo failed to get build log!\n"); halide_assert(user_context, err == CL_SUCCESS); } } }
int main(int argc, char** argv) { int err; // error code returned from api calls float a1[DATA_SIZE1]; // original data set given to device float b1[FILTER_SIZE1]; // original data set given to device float c1[OUTPUT_SIZE1]; float results1[OUTPUT_SIZE1]; // results returned from device float sw_results1[OUTPUT_SIZE1]; // results returned from device unsigned int correct; // number of correct results returned size_t global[2]; // global domain size for our calculation size_t local[2]; // local domain size for our calculation cl_platform_id platform_id; // platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel char cl_platform_vendor[1001]; char cl_platform_name[1001]; cl_mem input_a; // device memory used for the input array cl_mem input_b; // device memory used for the input array cl_mem output; // device memory used for the output array if (argc != 2){ printf("%s <inputfile>\n", argv[0]); return EXIT_FAILURE; } // Fill our data sets with pattern // int i = 0; for(i = 0; i < DATA_SIZE1; i++) { a1[i] = (float)1; } for(i = 0; i < OUTPUT_SIZE1; i++) { results1[i] = 0; sw_results1[i] = FILTER_SIZE1; } for(i = 0; i < FILTER_SIZE1; i++) { b1[i] = (float)1; } for(i = 0; i < OUTPUT_SIZE1; i++) { c1[i] = (float)0; } // Connect to first platform // err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor); err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("CL_PLATFORM_NAME %s\n",cl_platform_name); // Connect to a compute device // int fpga = 0; #if defined (FPGA_DEVICE) fpga = 1; #endif err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); return EXIT_FAILURE; } int status; // Create Program Objects // // Load binary from disk unsigned char *kernelbinary; char *xclbin=argv[1]; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin: %s\n", xclbin); printf("Test failed\n"); return EXIT_FAILURE; } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, &device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); return EXIT_FAILURE; } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "conv3_layer", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create the input and output arrays in device memory for our calculation // input_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE1, NULL, NULL); input_b = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * FILTER_SIZE1, NULL, NULL); output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * OUTPUT_SIZE1, NULL, NULL); if (!input_a || !input_b || !output) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input_a, CL_TRUE, 0, sizeof(float) * DATA_SIZE1, a1, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array a!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input_b, CL_TRUE, 0, sizeof(float) * FILTER_SIZE1, b1, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array b!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clEnqueueWriteBuffer(commands, output, CL_TRUE, 0, sizeof(float) * OUTPUT_SIZE1, c1, 0, NULL, NULL); // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_a); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input_b); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // #ifdef C_KERNEL err = clEnqueueTask(commands, kernel, 0, NULL, NULL); #else global[0] = MATRIX_RANK; global[1] = MATRIX_RANK; local[0] = MATRIX_RANK; local[1] = MATRIX_RANK; err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, (size_t*)&global, (size_t*)&local, 0, NULL, NULL); #endif if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Read back the results from the device to verify the output // cl_event readevent; err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * OUTPUT_SIZE1, results1, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } clWaitForEvents(1, &readevent); printf("A\n"); for (i=0;i<DATA_SIZE1;i++) { printf("%f ",a1[i]); if (((i+1) % NUM_DATA_ROWS) == 0) printf("\n"); } printf("B\n"); for (i=0;i< FILTER_SIZE1;i++) { printf("%f ",b1[i]); if (((i+1) % NUM_MASK_ROWS) == 0) printf("\n"); } printf("res\n"); for (i=0;i< OUTPUT_SIZE1;i++) { printf("%f ",results1[i]); if (((i+1) % NUM_OUT_ROWS) == 0) printf("\n"); } // Validate our results // correct = 0; /* for(i = 0; i < OUTPUT_SIZE1; i++) { int row = i/MATRIX_RANK; int col = i%MATRIX_RANK; int running = 0; int index; for (index=0;index<MATRIX_RANK;index++) { int aIndex = row*MATRIX_RANK + index; int bIndex = col + index*MATRIX_RANK; running += a[aIndex] * b[bIndex]; } sw_results[i] = running; }*/ for (i = 0;i < OUTPUT_SIZE1; i++) if(results1[i] == sw_results1[i]) correct++; printf("Software\n"); for (i=0;i<OUTPUT_SIZE1;i++) { //printf("%0.2f ",sw_results[i]); printf("%f ",sw_results1[i]); if (((i+1) % NUM_OUT_ROWS) == 0) printf("\n"); } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, OUTPUT_SIZE1); // Shutdown and cleanup // clReleaseMemObject(input_a); clReleaseMemObject(input_b); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); if(correct == OUTPUT_SIZE1){ printf("Test passed!\n"); return EXIT_SUCCESS; } else{ printf("Test failed\n"); return EXIT_FAILURE; } }
_clState *initCl(unsigned int gpu, char *name, size_t nameSize) { _clState *clState = (_clState *)calloc(1, sizeof(_clState)); bool patchbfi = false, prog_built = false; struct cgpu_info *cgpu = &gpus[gpu]; cl_platform_id platform = NULL; char pbuff[256], vbuff[255]; cl_platform_id* platforms; cl_uint preferred_vwidth; cl_device_id *devices; cl_uint numPlatforms; cl_uint numDevices; cl_int status; status = clGetPlatformIDs(0, NULL, &numPlatforms); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platforms. (clGetPlatformsIDs)", status); return NULL; } platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status); return NULL; } if (opt_platform_id >= (int)numPlatforms) { applog(LOG_ERR, "Specified platform that does not exist"); return NULL; } status = clGetPlatformInfo(platforms[opt_platform_id], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status); return NULL; } platform = platforms[opt_platform_id]; if (platform == NULL) { perror("NULL platform found!\n"); return NULL; } applog(LOG_INFO, "CL Platform vendor: %s", pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform name: %s", pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(vbuff), vbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform version: %s", vbuff); status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status); return NULL; } if (numDevices > 0 ) { devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); /* Now, get the device list data */ status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (list)", status); return NULL; } applog(LOG_INFO, "List of devices:"); unsigned int i; for (i = 0; i < numDevices; i++) { status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device Info", status); return NULL; } applog(LOG_INFO, "\t%i\t%s", i, pbuff); } if (gpu < numDevices) { status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device Info", status); return NULL; } applog(LOG_INFO, "Selected %i: %s", gpu, pbuff); strncpy(name, pbuff, nameSize); } else { applog(LOG_ERR, "Invalid GPU %i", gpu); return NULL; } } else return NULL; cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status); return NULL; } ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status); if (status != CL_SUCCESS) /* Try again without OOE enable */ clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status); return NULL; } /* Check for BFI INT support. Hopefully people don't mix devices with * and without it! */ char * extensions = (char *)malloc(1024); const char * camo = "cl_amd_media_ops"; char *find; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS", status); return NULL; } find = strstr(extensions, camo); if (find) clState->hasBitAlign = true; /* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */ char * devoclver = (char *)malloc(1024); const char * ocl10 = "OpenCL 1.0"; const char * ocl11 = "OpenCL 1.1"; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_VERSION, 1024, (void *)devoclver, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_VERSION", status); return NULL; } find = strstr(devoclver, ocl10); if (!find) { clState->hasOpenCL11plus = true; find = strstr(devoclver, ocl11); if (!find) clState->hasOpenCL12plus = true; } status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&preferred_vwidth, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", status); return NULL; } applog(LOG_DEBUG, "Preferred vector width reported %d", preferred_vwidth); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE", status); return NULL; } applog(LOG_DEBUG, "Max work group size reported %d", (int)(clState->max_work_size)); size_t compute_units = 0; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), (void *)&compute_units, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_COMPUTE_UNITS", status); return NULL; } // AMD architechture got 64 compute shaders per compute unit. // Source: http://www.amd.com/us/Documents/GCN_Architecture_whitepaper.pdf clState->compute_shaders = compute_units * 64; applog(LOG_DEBUG, "Max shaders calculated %d", (int)(clState->compute_shaders)); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status); return NULL; } applog(LOG_DEBUG, "Max mem alloc size is %lu", (long unsigned int)(cgpu->max_alloc)); /* Create binary filename based on parameters passed to opencl * compiler to ensure we only load a binary that matches what * would have otherwise created. The filename is: * name + kernelname + g + lg + lookup_gap + tc + thread_concurrency + nf + nfactor + w + work_size + l + sizeof(long) + .bin */ char binaryfilename[255]; char filename[255]; char strbuf[32]; if (cgpu->kernelname == NULL) { applog(LOG_INFO, "No kernel specified, defaulting to ckolivas"); cgpu->kernelname = strdup("ckolivas"); } if (strcmp(cgpu->kernelname, ALEXKARNEW_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel alexkarnew is experimental."); strcpy(filename, ALEXKARNEW_KERNNAME".cl"); strcpy(binaryfilename, ALEXKARNEW_KERNNAME); } else if (strcmp(cgpu->kernelname, ALEXKAROLD_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel alexkarold is experimental."); strcpy(filename, ALEXKAROLD_KERNNAME".cl"); strcpy(binaryfilename, ALEXKAROLD_KERNNAME); } else if (strcmp(cgpu->kernelname, CKOLIVAS_KERNNAME) == 0){ strcpy(filename, CKOLIVAS_KERNNAME".cl"); strcpy(binaryfilename, CKOLIVAS_KERNNAME); } else if (strcmp(cgpu->kernelname, PSW_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel psw is experimental."); strcpy(filename, PSW_KERNNAME".cl"); strcpy(binaryfilename, PSW_KERNNAME); } else if (strcmp(cgpu->kernelname, ZUIKKIS_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel zuikkis is experimental."); strcpy(filename, ZUIKKIS_KERNNAME".cl"); strcpy(binaryfilename, ZUIKKIS_KERNNAME); /* Kernel only supports worksize 256 */ cgpu->work_size = 256; } else if (strcmp(cgpu->kernelname, DARKCOIN_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel darkcoin is experimental."); strcpy(filename, DARKCOIN_KERNNAME".cl"); strcpy(binaryfilename, DARKCOIN_KERNNAME); } else if (strcmp(cgpu->kernelname, QUBITCOIN_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel qubitcoin is experimental."); strcpy(filename, QUBITCOIN_KERNNAME".cl"); strcpy(binaryfilename, QUBITCOIN_KERNNAME); } else if (strcmp(cgpu->kernelname, QUARKCOIN_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel quarkcoin is experimental."); strcpy(filename, QUARKCOIN_KERNNAME".cl"); strcpy(binaryfilename, QUARKCOIN_KERNNAME); } else if (strcmp(cgpu->kernelname, FUGUECOIN_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel fuguecoin is experimental."); strcpy(filename, FUGUECOIN_KERNNAME".cl"); strcpy(binaryfilename, FUGUECOIN_KERNNAME); } else if (strcmp(cgpu->kernelname, INKCOIN_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel inkcoin is experimental."); strcpy(filename, INKCOIN_KERNNAME".cl"); strcpy(binaryfilename, INKCOIN_KERNNAME); } else if (strcmp(cgpu->kernelname, ANIMECOIN_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel animecoin is experimental."); strcpy(filename, ANIMECOIN_KERNNAME".cl"); strcpy(binaryfilename, ANIMECOIN_KERNNAME); } else if (strcmp(cgpu->kernelname, GROESTLCOIN_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel groestlcoin is experimental."); strcpy(filename, GROESTLCOIN_KERNNAME".cl"); strcpy(binaryfilename, GROESTLCOIN_KERNNAME); } else if (strcmp(cgpu->kernelname, SIFCOIN_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel groestlcoin is experimental."); strcpy(filename, SIFCOIN_KERNNAME".cl"); strcpy(binaryfilename, SIFCOIN_KERNNAME); } else if (strcmp(cgpu->kernelname, MYRIADCOIN_GROESTL_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel myriadcoin-groestl is experimental."); strcpy(filename, MYRIADCOIN_GROESTL_KERNNAME".cl"); strcpy(binaryfilename, MYRIADCOIN_GROESTL_KERNNAME); } else if (strcmp(cgpu->kernelname, MYRIADCOIN_SKEIN_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel myriadcoin-skein is experimental."); strcpy(filename, MYRIADCOIN_SKEIN_KERNNAME".cl"); strcpy(binaryfilename, MYRIADCOIN_SKEIN_KERNNAME); } else if (strcmp(cgpu->kernelname, MYRIADCOIN_QUBIT_KERNNAME) == 0){ applog(LOG_WARNING, "Kernel myriadcoin-qubit is experimental."); strcpy(filename, MYRIADCOIN_QUBIT_KERNNAME".cl"); strcpy(binaryfilename, MYRIADCOIN_QUBIT_KERNNAME); } else { applog(LOG_WARNING, "Kernel was not chosen."); } /* For some reason 2 vectors is still better even if the card says * otherwise, and many cards lie about their max so use 256 as max * unless explicitly set on the command line. Tahiti prefers 1 */ if (strstr(name, "Tahiti")) preferred_vwidth = 1; else if (preferred_vwidth > 2) preferred_vwidth = 2; /* All available kernels only support vector 1 */ cgpu->vwidth = 1; /* Vectors are hard-set to 1 above. */ if (likely(cgpu->vwidth)) clState->vwidth = cgpu->vwidth; else { clState->vwidth = preferred_vwidth; cgpu->vwidth = preferred_vwidth; } clState->goffset = true; if (cgpu->work_size && cgpu->work_size <= clState->max_work_size) clState->wsize = cgpu->work_size; else clState->wsize = 256; if (!cgpu->opt_lg) { applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu); cgpu->lookup_gap = 2; } else cgpu->lookup_gap = cgpu->opt_lg; if ((strcmp(cgpu->kernelname, "zuikkis") == 0) && (cgpu->lookup_gap != 2)) { applog(LOG_WARNING, "Kernel zuikkis only supports lookup-gap = 2 (currently %d), forcing.", cgpu->lookup_gap); cgpu->lookup_gap = 2; } if (!cgpu->opt_tc) { unsigned int sixtyfours; sixtyfours = cgpu->max_alloc / 131072 / 64 / (algorithm->n/1024) - 1; cgpu->thread_concurrency = sixtyfours * 64; if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) { cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders; if (cgpu->thread_concurrency > cgpu->shaders * 5) cgpu->thread_concurrency = cgpu->shaders * 5; } applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %d", gpu, (int)(cgpu->thread_concurrency)); } else cgpu->thread_concurrency = cgpu->opt_tc; FILE *binaryfile; size_t *binary_sizes; char **binaries; int pl; char *source = file_contents(filename, &pl); size_t sourceSize[] = {(size_t)pl}; cl_uint slot, cpnd; slot = cpnd = 0; if (!source) return NULL; binary_sizes = (size_t *)calloc(sizeof(size_t) * MAX_GPUDEVICES * 4, 1); if (unlikely(!binary_sizes)) { applog(LOG_ERR, "Unable to calloc binary_sizes"); return NULL; } binaries = (char **)calloc(sizeof(char *) * MAX_GPUDEVICES * 4, 1); if (unlikely(!binaries)) { applog(LOG_ERR, "Unable to calloc binaries"); return NULL; } strcat(binaryfilename, name); if (clState->goffset) strcat(binaryfilename, "g"); sprintf(strbuf, "lg%utc%unf%u", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, algorithm->nfactor); strcat(binaryfilename, strbuf); sprintf(strbuf, "w%d", (int)clState->wsize); strcat(binaryfilename, strbuf); sprintf(strbuf, "l%d", (int)sizeof(long)); strcat(binaryfilename, strbuf); strcat(binaryfilename, ".bin"); binaryfile = fopen(binaryfilename, "rb"); if (!binaryfile) { applog(LOG_DEBUG, "No binary found, generating from source"); } else { struct stat binary_stat; if (unlikely(stat(binaryfilename, &binary_stat))) { applog(LOG_DEBUG, "Unable to stat binary, generating from source"); fclose(binaryfile); goto build; } if (!binary_stat.st_size) goto build; binary_sizes[slot] = binary_stat.st_size; binaries[slot] = (char *)calloc(binary_sizes[slot], 1); if (unlikely(!binaries[slot])) { applog(LOG_ERR, "Unable to calloc binaries"); fclose(binaryfile); return NULL; } if (fread(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot]) { applog(LOG_ERR, "Unable to fread binaries"); fclose(binaryfile); free(binaries[slot]); goto build; } clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status); fclose(binaryfile); free(binaries[slot]); goto build; } fclose(binaryfile); applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename); goto built; } ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// build: applog(LOG_NOTICE, "Building binary %s", binaryfilename); clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithSource)", status); return NULL; } /* create a cl program executable for all the devices specified */ char *CompilerOptions = (char *)calloc(1, 256); sprintf(CompilerOptions, "-I \"%s\" -I \"%s\" -I \"%skernel\" -I \".\" -D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d -D NFACTOR=%d", opt_kernel_path, sgminer_path, sgminer_path, cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, (int)clState->wsize, (unsigned int)algorithm->nfactor); applog(LOG_DEBUG, "Setting worksize to %d", (int)(clState->wsize)); if (clState->vwidth > 1) applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth); if (clState->hasBitAlign) { strcat(CompilerOptions, " -D BITALIGN"); applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN"); if (!clState->hasOpenCL12plus && (strstr(name, "Cedar") || strstr(name, "Redwood") || strstr(name, "Juniper") || strstr(name, "Cypress" ) || strstr(name, "Hemlock" ) || strstr(name, "Caicos" ) || strstr(name, "Turks" ) || strstr(name, "Barts" ) || strstr(name, "Cayman" ) || strstr(name, "Antilles" ) || strstr(name, "Wrestler" ) || strstr(name, "Zacate" ) || strstr(name, "WinterPark" ))) patchbfi = true; } else applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN"); if (patchbfi) { strcat(CompilerOptions, " -D BFI_INT"); applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT"); } else applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch"); if (clState->goffset) strcat(CompilerOptions, " -D GOFFSET"); if (!clState->hasOpenCL11plus) strcat(CompilerOptions, " -D OCL1"); applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions); status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL); free(CompilerOptions); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = (char *)malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_ERR, "%s", log); return NULL; } prog_built = true; #ifdef __APPLE__ /* OSX OpenCL breaks reading off binaries with >1 GPU so always build * from source. */ goto built; #endif status = clGetProgramInfo(clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)", status); return NULL; } status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)", status); return NULL; } /* The actual compiled binary ends up in a RANDOM slot! Grr, so we have * to iterate over all the binary slots and find where the real program * is. What the heck is this!? */ for (slot = 0; slot < cpnd; slot++) if (binary_sizes[slot]) break; /* copy over all of the generated binaries. */ applog(LOG_DEBUG, "Binary size for gpu %d found in binary slot %d: %d", gpu, slot, (int)(binary_sizes[slot])); if (!binary_sizes[slot]) { applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!"); return NULL; } binaries[slot] = (char *)calloc(sizeof(char)* binary_sizes[slot], 1); status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL ); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)", status); return NULL; } /* Patch the kernel if the hardware supports BFI_INT but it needs to * be hacked in */ if (patchbfi) { unsigned remaining = binary_sizes[slot]; char *w = binaries[slot]; unsigned int start, length; /* Find 2nd incidence of .text, and copy the program's * position and length at a fixed offset from that. Then go * back and find the 2nd incidence of \x7ELF (rewind by one * from ELF) and then patch the opcocdes */ if (!advance(&w, &remaining, ".text")) goto build; w++; remaining--; if (!advance(&w, &remaining, ".text")) { /* 32 bit builds only one ELF */ w--; remaining++; } memcpy(&start, w + 285, 4); memcpy(&length, w + 289, 4); w = binaries[slot]; remaining = binary_sizes[slot]; if (!advance(&w, &remaining, "ELF")) goto build; w++; remaining--; if (!advance(&w, &remaining, "ELF")) { /* 32 bit builds only one ELF */ w--; remaining++; } w--; remaining++; w += start; remaining -= start; applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching", w, remaining); patch_opcodes(w, length); status = clReleaseProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Releasing program. (clReleaseProgram)", status); return NULL; } clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status); return NULL; } /* Program needs to be rebuilt */ prog_built = false; } free(source); /* Save the binary to be loaded next time */ binaryfile = fopen(binaryfilename, "wb"); if (!binaryfile) { /* Not fatal, just means we build it again next time */ applog(LOG_DEBUG, "Unable to create file %s", binaryfilename); } else { if (unlikely(fwrite(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot])) { applog(LOG_ERR, "Unable to fwrite to binaryfile"); return NULL; } fclose(binaryfile); } built: if (binaries[slot]) free(binaries[slot]); free(binaries); free(binary_sizes); applog(LOG_NOTICE, "Initialising kernel %s with%s bitalign, %spatched BFI", filename, clState->hasBitAlign ? "" : "out", patchbfi ? "" : "un"); if (!prog_built) { /* create a cl program executable for all the devices specified */ status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = (char *)malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_ERR, "%s", log); return NULL; } } /* get a kernel object handle for a kernel with the given name */ clState->kernel = clCreateKernel(clState->program, "search", &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Kernel from program. (clCreateKernel)", status); return NULL; } size_t ipt = (algorithm->n / cgpu->lookup_gap + (algorithm->n % cgpu->lookup_gap > 0)); size_t bufsize = 128 * ipt * cgpu->thread_concurrency; /* Use the max alloc value which has been rounded to a power of * 2 greater >= required amount earlier */ if (bufsize > cgpu->max_alloc) { applog(LOG_WARNING, "Maximum buffer memory device %d supports says %lu", gpu, (unsigned long)(cgpu->max_alloc)); applog(LOG_WARNING, "Your scrypt settings come to %lu", (unsigned long)bufsize); } applog(LOG_DEBUG, "Creating scrypt buffer sized %lu", (unsigned long)bufsize); clState->padbufsize = bufsize; /* This buffer is weird and might work to some degree even if * the create buffer call has apparently failed, so check if we * get anything back before we call it a failure. */ clState->padbuffer8 = NULL; clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); if (status != CL_SUCCESS && !clState->padbuffer8) { applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease TC or increase LG", status); return NULL; } clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status); return NULL; } clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status); return NULL; } return clState; }
// Main function // ********************************************************************* int main(int argc, char **argv) { void *srcA, *srcB, *dst; // Host buffers for OpenCL test cl_context cxGPUContext; // OpenCL context cl_command_queue cqCommandQue; // OpenCL command que cl_device_id* cdDevices; // OpenCL device list cl_program cpProgram; // OpenCL program cl_kernel ckKernel; // OpenCL kernel cl_mem cmMemObjs[3]; // OpenCL memory buffer objects: 3 for device size_t szGlobalWorkSize[1]; // 1D var for Total # of work items size_t szLocalWorkSize[1]; // 1D var for # of work items in the work group size_t szParmDataBytes; // Byte size of context information cl_int ciErr1, ciErr2; // Error code var int iTestN = 100000 * 8; // Size of Vectors to process // set Global and Local work size dimensions szGlobalWorkSize[0] = iTestN >> 3; // do 8 computations per work item szLocalWorkSize[0]= iTestN>>3; // Allocate and initialize host arrays srcA = (void *)malloc (sizeof(cl_float) * iTestN); srcB = (void *)malloc (sizeof(cl_float) * iTestN); dst = (void *)malloc (sizeof(cl_float) * iTestN); int i; // Initialize arrays with some values for (i=0;i<iTestN;i++) { ((cl_float*)srcA)[i] = cl_float(i); ((cl_float*)srcB)[i] = 2; ((cl_float*)dst)[i]=-1; } // Create OpenCL context & context cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_CPU, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU // Query all devices available to the context ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes); cdDevices = (cl_device_id*)malloc(szParmDataBytes); ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL); if (cdDevices) { printDevInfo(cdDevices[0]); } // Create a command queue for first device the context reported cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2); ciErr1 |= ciErr2; // Allocate the OpenCL source and result buffer memory objects on the device GMEM cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2); ciErr1 |= ciErr2; cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2); ciErr1 |= ciErr2; cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2); ciErr1 |= ciErr2; ///create kernels from binary int numDevices = 1; cl_int err; ::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t)); const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*)); for (i = 0; i < numDevices; ++i) { images[i] = 0; lengths[i] = 0; } cpProgram = clCreateProgramWithBinary(cxGPUContext, numDevices,cdDevices,lengths, images, 0, &err); // Build the executable program from a binary ciErr1 |= clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); // Create the kernel ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1); // Set the Argument values ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]); // Copy input data from host to GPU and launch kernel ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); // Read back results and check accumulated errors ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL); // Release kernel, program, and memory objects // NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity. free(cdDevices); clReleaseKernel(ckKernel); clReleaseProgram(cpProgram); clReleaseCommandQueue(cqCommandQue); clReleaseContext(cxGPUContext); // print the results int iErrorCount = 0; for (i = 0; i < iTestN; i++) { if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i]) iErrorCount++; } if (iErrorCount) { printf("MiniCL validation FAILED\n"); } else { printf("MiniCL validation SUCCESSFULL\n"); } // Free host memory, close log and return success for (i = 0; i < 3; i++) { clReleaseMemObject(cmMemObjs[i]); } free(srcA); free(srcB); free (dst); }
bool initOpenCL(ComputeEnv *env) { int r = cllib_init(); if (r < 0) { return false; } cl_uint num_plt; cl_platform_id plts[16]; clGetPlatformIDs(16, plts, &num_plt); bool found = false; cl_int err; cl_platform_id platform; cl_context context; cl_device_id dev; cl_command_queue queue; cl_kernel ker_filter, ker_filter_in1_out32, ker_filter_in128_out1; cl_kernel ker_filter_in3_out32, ker_filter_in128_out3; cl_program program = 0; for (unsigned int i=0; i<num_plt; i++) { size_t sz; cl_uint num_dev; clGetPlatformInfo(plts[i], CL_PLATFORM_NAME, 0, nullptr, &sz); std::vector<char> name(sz); clGetPlatformInfo(plts[i], CL_PLATFORM_NAME, sz, &name[0], &sz); bool is_amd = strstr(&name[0], "AMD") != NULL; bool is_apple = strstr(&name[0], "Apple") != NULL; //bool is_intel = strstr(&name[0], "Intel") != NULL; //bool is_nvidia = strstr(&name[0], "NVIDIA") != NULL; if (!is_amd && !is_apple) { continue; } clGetDeviceIDs(plts[i], CL_DEVICE_TYPE_GPU, 0, nullptr, &num_dev); if (num_dev == 0) { continue; } std::vector<cl_device_id> devs(num_dev); clGetDeviceIDs(plts[i], CL_DEVICE_TYPE_GPU, num_dev, &devs[0], &num_dev); platform = plts[i]; dev = devs[0]; cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(plts[i]), 0}; cl_context ctxt = clCreateContext(props, 1, &devs[0], NULL, NULL, &err); if (err != CL_SUCCESS) { continue; } context = ctxt; found = true; break; } if (!found) { return false; } size_t dev_name_len; clGetDeviceInfo(dev, CL_DEVICE_NAME, 0, nullptr, &dev_name_len); std::vector<char> dev_name(dev_name_len+1); clGetDeviceInfo(dev, CL_DEVICE_NAME, dev_name_len, &dev_name[0], &dev_name_len); bool bin_avaiable = false; #if defined __linux || _WIN32 #ifdef __linux ssize_t path_len = 4; char *self_path = (char*)malloc(path_len+1); while (1) { ssize_t r = readlink("/proc/self/exe", self_path, path_len); if (r < path_len) { self_path[r] = '\0'; break; } path_len *= 2; self_path = (char*)realloc(self_path, path_len+1); } struct stat self_st; stat(self_path, &self_st); self_path = dirname(self_path); #else size_t path_len = 4; char *self_path = (char*)malloc(path_len+1); DWORD len; while (1) { len = GetModuleFileName(NULL, self_path, path_len); if (len > 0 && len != path_len) { break; } path_len *= 2; self_path = (char*)realloc(self_path, path_len+1); } WIN32_FIND_DATA self_st; HANDLE finder = FindFirstFile(self_path, &self_st); FindClose(finder); for (int si=len-1; si>=0; si--) { if (self_path[si] == '\\') { self_path[si] = '\0'; break; } } #endif std::string bin_path = std::string(self_path) + "/" + &dev_name[0] + ".bin"; FILE *binfp = fopen(bin_path.c_str(), "rb"); if (binfp) { #ifdef __linux struct stat bin_st; stat(bin_path.c_str(), &bin_st); bool old = false; if (bin_st.st_mtim.tv_sec < self_st.st_mtim.tv_sec) { old = true; } if (bin_st.st_mtim.tv_sec == self_st.st_mtim.tv_sec) { if (bin_st.st_mtim.tv_nsec < self_st.st_mtim.tv_nsec) { old = true; } } size_t bin_sz = bin_st.st_size; #else WIN32_FIND_DATA bin_st; HANDLE finder = FindFirstFile(bin_path.c_str(), &bin_st); FindClose(finder); bool old = false; uint64_t self_time = (((uint64_t)self_st.ftLastWriteTime.dwHighDateTime)<<32) | ((uint64_t)self_st.ftLastWriteTime.dwLowDateTime); uint64_t bin_time = (((uint64_t)bin_st.ftLastWriteTime.dwHighDateTime)<<32) | ((uint64_t)bin_st.ftLastWriteTime.dwLowDateTime); if (bin_time < self_time) { old = true; } size_t bin_sz = bin_st.nFileSizeLow; #endif if (!old) { unsigned char *bin = (unsigned char*)malloc(bin_sz); size_t rem = bin_sz; unsigned char *p = bin; while (rem) { size_t rsz = fread(p, 1, rem, binfp); if (rsz <= 0) { break; } rem -= rsz; p += rsz; } if (rem == 0) { cl_int err; program = clCreateProgramWithBinary(context, 1, &dev, &bin_sz, (const unsigned char**)&bin, NULL, &err); if (err == CL_SUCCESS) { bin_avaiable = true; } } free(bin); } fclose(binfp); } #endif if (! bin_avaiable) { const char *source[1] = {prog}; size_t src_len[1] = {sizeof(prog)-1}; program = clCreateProgramWithSource(context, 1, source, src_len, &err); if (err != CL_SUCCESS) { clReleaseContext(context); return false; } } #if defined __linux || defined _WIN32 free(self_path); #endif err = clBuildProgram(program, 1, &dev, "" , nullptr, nullptr); if (err != CL_SUCCESS) { size_t log_len; clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_len); std::vector<char> log(log_len+1); clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, log_len, &log[0], &log_len); log[log_len] = '\0'; puts(&log[0]); clReleaseProgram(program); clReleaseContext(context); return false; } #if defined __linux || _WIN32 if (!bin_avaiable) { size_t binsz; size_t ret_len; clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(binsz), &binsz, &ret_len); char *buffer = new char [binsz]; char *ptrs[1]; ptrs[0] = buffer; clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(ptrs), ptrs, &ret_len); FILE *fp = fopen(bin_path.c_str(), "wb"); size_t rem = binsz; char *p = buffer; while (rem) { size_t wsz = fwrite(p, 1, rem, fp); if (wsz <= 0) { fclose(fp); unlink(bin_path.c_str()); fp=NULL; break; } rem -= wsz; p += wsz; } if (fp) { fclose(fp); } delete [] buffer; } #endif ker_filter = clCreateKernel(program, "filter", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); return false; } ker_filter_in1_out32 = clCreateKernel(program, "filter_in1_out32", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); return false; } ker_filter_in3_out32 = clCreateKernel(program, "filter_in3_out32", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); clReleaseKernel(ker_filter_in1_out32); return false; } ker_filter_in128_out1 = clCreateKernel(program, "filter_in128_out1", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); clReleaseKernel(ker_filter_in1_out32); return false; } ker_filter_in128_out3 = clCreateKernel(program, "filter_in128_out3", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); clReleaseKernel(ker_filter_in1_out32); return false; } queue = clCreateCommandQueue(context, dev, 0, &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); clReleaseKernel(ker_filter_in1_out32); return false; } env->num_cl_dev = 1; env->cl_dev_list = new OpenCLDev[1]; env->cl_dev_list[0].platform = platform; env->cl_dev_list[0].context = context; env->cl_dev_list[0].devid = dev; env->cl_dev_list[0].queue = queue; env->cl_dev_list[0].program = program; env->cl_dev_list[0].ker_filter = ker_filter; env->cl_dev_list[0].ker_filter_in1_out32 = ker_filter_in1_out32; env->cl_dev_list[0].ker_filter_in128_out1 = ker_filter_in128_out1; env->cl_dev_list[0].ker_filter_in3_out32 = ker_filter_in3_out32; env->cl_dev_list[0].ker_filter_in128_out3 = ker_filter_in128_out3; env->cl_dev_list[0].name = &dev_name[0]; return true; }
int exec_dot_product_kernel(const char *program_source, size_t source_size, int n, cl_float4 *srcA, cl_float4 *srcB, cl_float *dst) { cl_context context; cl_command_queue cmd_queue; cl_device_id *devices; cl_program program; cl_kernel kernel; cl_mem memobjs[3]; size_t global_work_size[1]; size_t local_work_size[1]; size_t cb; cl_int err; int i; context = poclu_create_any_context(); if (context == (cl_context)0) return -1; // get the list of GPU devices associated with context clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); devices = (cl_device_id *) malloc(cb); clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); // create a command-queue cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); if (cmd_queue == (cl_command_queue)0) { clReleaseContext(context); free(devices); return -1; } for (i = 0; i < n; ++i) { poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcA[i], 4); poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcB[i], 4); } // allocate the buffer memory objects memobjs[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float4) * n, srcA, NULL); if (memobjs[0] == (cl_mem)0) { clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } memobjs[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float4) * n, srcB, NULL); if (memobjs[1] == (cl_mem)0) { delete_memobjs(memobjs, 1); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } memobjs[2] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * n, NULL, NULL); if (memobjs[2] == (cl_mem)0) { delete_memobjs(memobjs, 2); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the program program = clCreateProgramWithBinary (context, 1, devices, &source_size, (const unsigned char**)&program_source, NULL, NULL); if (program == (cl_program)0) { delete_memobjs(memobjs, 3); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 3); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the kernel kernel = clCreateKernel(program, "dot_product", NULL); if (kernel == (cl_kernel)0) { delete_memobjs(memobjs, 3); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set the args values err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &memobjs[0]); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &memobjs[1]); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &memobjs[2]); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 3); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set work-item dimensions global_work_size[0] = n; local_work_size[0]= 128; // execute kernel err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 3); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // read output image err = clEnqueueReadBuffer(cmd_queue, memobjs[2], CL_TRUE, 0, n * sizeof(cl_float), dst, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 3); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } for (i = 0; i < n; ++i) { poclu_bswap_cl_float_array(devices[0], (cl_float*)&dst[i], 1); poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcA[i], 4); poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcB[i], 4); } free(devices); // release kernel, program, and memory objects delete_memobjs(memobjs, 3); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return 0; // success... }
int main(int argc, char **argv) { cl_uint num; cl_int err; int platform_idx = -1; cl_platform_id *plat_ids; int i; size_t sz; cl_device_id *gpu_devs; cl_context_properties cps[3]; cl_context context; int opt; char *input; int run_size = 1024; struct AIISA_Program prog; cl_command_queue queue; int ei; int nloop = 16; struct AIISA_CodeBuffer buf; aiisa_code_buffer_init(&buf); clGetPlatformIDs(0, NULL, &num); plat_ids = (cl_platform_id*)malloc(sizeof(*plat_ids) * num); clGetPlatformIDs(num, plat_ids, NULL); while ((opt = getopt(argc, argv, "n:")) != -1) { switch (opt) { case 'n': run_size = atoi(optarg); break; default: puts("usage : run in.cl"); return 1; } } if (optind >= argc) { puts("usage : run in.cl"); return 1; } input = argv[optind]; for (i=0; i<(int)num; i++) { char name[1024]; size_t len; clGetPlatformInfo(plat_ids[i], CL_PLATFORM_VENDOR, sizeof(name), name, &len); //puts(name); if (strcmp(name, "Advanced Micro Devices, Inc.") == 0) { platform_idx = i; break; } } if (platform_idx == -1) { puts("no amd"); return -1; } clGetDeviceIDs(plat_ids[platform_idx], CL_DEVICE_TYPE_GPU, 0, NULL, &num); if (num == 0) { puts("no gpu"); return -1; } gpu_devs = (cl_device_id*)malloc(sizeof(gpu_devs[0]) * 1); //clGetDeviceIDs(plat_ids[platform_idx], CL_DEVICE_TYPE_GPU, num, gpu_devs, NULL); cps[0] = CL_CONTEXT_PLATFORM; cps[1] = (cl_context_properties)plat_ids[platform_idx]; cps[2] = 0; context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &err); clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(gpu_devs), gpu_devs, &sz); queue = clCreateCommandQueue(context, gpu_devs[0], 0, NULL); { char name[1024]; size_t sz; clGetDeviceInfo(gpu_devs[0], CL_DEVICE_NAME, sizeof(name), name, &sz); puts(name); } //puts(input); aiisa_build_binary_from_cl(&prog, context, gpu_devs[0], input); for (ei=0; ei<nloop; ei++) { cl_program cl_prog; const unsigned char *bin[1]; size_t bin_size[1]; cl_kernel ker; cl_mem in, out; size_t global_size[3]; double tb, te; tb = sec(); gen_code(&prog, &buf); bin[0] = prog.cl_binary; bin_size[0] = prog.size; cl_prog = clCreateProgramWithBinary(context, 1, gpu_devs, bin_size, bin, NULL, NULL); clBuildProgram(cl_prog, 1, gpu_devs, NULL, NULL, NULL); ker = clCreateKernel(cl_prog, "f", &err); te = sec(); printf("build : %f[usec]\n", (te-tb)*1000000); in = clCreateBuffer(context, CL_MEM_READ_WRITE, run_size * sizeof(int), NULL, &err); out = clCreateBuffer(context, CL_MEM_READ_WRITE, run_size * sizeof(int), NULL, &err); clSetKernelArg(ker, 0, sizeof(cl_mem), &in); clSetKernelArg(ker, 1, sizeof(cl_mem), &out); { int *ptr = (int*)clEnqueueMapBuffer(queue, in, CL_TRUE, CL_MAP_WRITE, 0, run_size*sizeof(int), 0, NULL, NULL, NULL); int i; for (i=0; i<run_size; i++) { ptr[i] = i; } clEnqueueUnmapMemObject(queue, in, ptr, 0, NULL, NULL); } { int *ptr = (int*)clEnqueueMapBuffer(queue, out, CL_TRUE, CL_MAP_WRITE, 0, run_size*sizeof(int), 0, NULL, NULL, NULL); int i; for (i=0; i<run_size; i++) { ptr[i] = 0xdeadbeef; } clEnqueueUnmapMemObject(queue, out, ptr, 0, NULL, NULL); } err = clFinish(queue); global_size[0] = run_size; err = clEnqueueNDRangeKernel(queue, ker, 1, NULL, global_size, NULL, 0, NULL, NULL); if (err != CL_SUCCESS) { puts("enqueue nd"); } err = clFinish(queue); if (err != CL_SUCCESS) { puts("fini"); } if (ei == 0) { int *ptr = (int*)clEnqueueMapBuffer(queue, out, CL_TRUE, CL_MAP_READ, 0, run_size*sizeof(int), 0, NULL, NULL, NULL); int i; for (i=0; i<run_size; i++) { printf("%d : %x\n", i, ptr[i]); } clEnqueueUnmapMemObject(queue, in, ptr, 0, NULL, NULL); } err = clFinish(queue); clReleaseMemObject(in); clReleaseMemObject(out); clReleaseKernel(ker); clReleaseProgram(cl_prog); } return 0; }
bool initOpenCL(W2XConv *c, ComputeEnv *env, W2XConvProcessor *proc) { int dev_id = proc->dev_id; env->num_cl_dev = 1; env->cl_dev_list = new OpenCLDev[1]; const OpenCLDevListEntry *de = &dev_list[dev_id]; cl_int err; cl_device_id dev = de->dev; cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(de->plt_id), 0}; cl_context context = clCreateContext(props, 1, &dev, NULL, NULL, &err); if (err != CL_SUCCESS) { setCLError(c, dev_id, err); return false; } if (proc->sub_type == W2XCONV_PROC_OPENCL_INTEL_GPU) { env->pref_block_size = 256; } cl_command_queue queue; cl_kernel ker_filter, ker_filter_in1_out32, ker_filter_in128_out1; cl_kernel ker_filter_in3_out32, ker_filter_in128_out3; cl_program program = 0; const char *dev_name = proc->dev_name; bool bin_avaiable = false; #if ((defined __linux) && !(defined __ANDROID__)) || _WIN32 #define GENERATE_BINARY #endif #ifdef GENERATE_BINARY #ifdef __linux ssize_t path_len = 4; char *self_path = (char*)malloc(path_len+1); while (1) { ssize_t r = readlink("/proc/self/exe", self_path, path_len); if (r < path_len) { self_path[r] = '\0'; break; } path_len *= 2; self_path = (char*)realloc(self_path, path_len+1); } struct stat self_st; stat(self_path, &self_st); self_path = dirname(self_path); #else size_t path_len = 4; char *self_path = (char*)malloc(path_len+1); DWORD len; while (1) { len = GetModuleFileName(NULL, self_path, path_len); if (len > 0 && len != path_len) { break; } path_len *= 2; self_path = (char*)realloc(self_path, path_len+1); } WIN32_FIND_DATA self_st; HANDLE finder = FindFirstFile(self_path, &self_st); FindClose(finder); for (int si=len-1; si>=0; si--) { if (self_path[si] == '\\') { self_path[si] = '\0'; break; } } #endif std::string bin_path = std::string(self_path) + "/" + &dev_name[0] + ".bin"; FILE *binfp = fopen(bin_path.c_str(), "rb"); if (binfp) { #if (defined __linux) struct stat bin_st; stat(bin_path.c_str(), &bin_st); bool old = false; if (bin_st.st_mtim.tv_sec < self_st.st_mtim.tv_sec) { old = true; } if (bin_st.st_mtim.tv_sec == self_st.st_mtim.tv_sec) { if (bin_st.st_mtim.tv_nsec < self_st.st_mtim.tv_nsec) { old = true; } } size_t bin_sz = bin_st.st_size; #else WIN32_FIND_DATA bin_st; HANDLE finder = FindFirstFile(bin_path.c_str(), &bin_st); FindClose(finder); bool old = false; uint64_t self_time = (((uint64_t)self_st.ftLastWriteTime.dwHighDateTime)<<32) | ((uint64_t)self_st.ftLastWriteTime.dwLowDateTime); uint64_t bin_time = (((uint64_t)bin_st.ftLastWriteTime.dwHighDateTime)<<32) | ((uint64_t)bin_st.ftLastWriteTime.dwLowDateTime); if (bin_time < self_time) { old = true; } size_t bin_sz = bin_st.nFileSizeLow; #endif if (!old) { unsigned char *bin = (unsigned char*)malloc(bin_sz); size_t rem = bin_sz; unsigned char *p = bin; while (rem) { size_t rsz = fread(p, 1, rem, binfp); if (rsz <= 0) { break; } rem -= rsz; p += rsz; } if (rem == 0) { cl_int err; program = clCreateProgramWithBinary(context, 1, &dev, &bin_sz, (const unsigned char**)&bin, NULL, &err); if (err == CL_SUCCESS) { bin_avaiable = true; } } free(bin); } fclose(binfp); } #endif if (! bin_avaiable) { const char *source[1] = {prog}; size_t src_len[1] = {sizeof(prog)-1}; program = clCreateProgramWithSource(context, 1, source, src_len, &err); if (err != CL_SUCCESS) { clReleaseContext(context); setCLError(c, dev_id, err); return false; } } #ifdef GENERATE_BINARY free(self_path); #endif err = clBuildProgram(program, 1, &dev, "" , nullptr, nullptr); if (err != CL_SUCCESS) { size_t log_len; clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_len); std::vector<char> log(log_len+1); clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, log_len, &log[0], &log_len); log[log_len] = '\0'; puts(&log[0]); clReleaseProgram(program); clReleaseContext(context); setCLError(c, dev_id, err); return false; } #ifdef GENERATE_BINARY if (!bin_avaiable) { size_t binsz; size_t ret_len; clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(binsz), &binsz, &ret_len); char *buffer = new char [binsz]; char *ptrs[1]; ptrs[0] = buffer; clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(ptrs), ptrs, &ret_len); FILE *fp = fopen(bin_path.c_str(), "wb"); size_t rem = binsz; char *p = buffer; while (rem) { size_t wsz = fwrite(p, 1, rem, fp); if (wsz <= 0) { fclose(fp); unlink(bin_path.c_str()); fp=NULL; break; } rem -= wsz; p += wsz; } if (fp) { fclose(fp); } delete [] buffer; } #endif ker_filter = clCreateKernel(program, "filter", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); setCLError(c, dev_id, err); return false; } ker_filter_in1_out32 = clCreateKernel(program, "filter_in1_out32", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); setCLError(c, dev_id, err); return false; } ker_filter_in3_out32 = clCreateKernel(program, "filter_in3_out32", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); clReleaseKernel(ker_filter_in1_out32); setCLError(c, dev_id, err); return false; } ker_filter_in128_out1 = clCreateKernel(program, "filter_in128_out1", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); clReleaseKernel(ker_filter_in1_out32); setCLError(c, dev_id, err); return false; } ker_filter_in128_out3 = clCreateKernel(program, "filter_in128_out3", &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); clReleaseKernel(ker_filter_in1_out32); setCLError(c, dev_id, err); return false; } queue = clCreateCommandQueue(context, dev, 0, &err); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseContext(context); clReleaseKernel(ker_filter); clReleaseKernel(ker_filter_in1_out32); setCLError(c, dev_id, err); return false; } env->num_cl_dev = 1; env->cl_dev_list = new OpenCLDev[1]; env->cl_dev_list[0].platform = de->plt_id; env->cl_dev_list[0].context = context; env->cl_dev_list[0].devid = dev; env->cl_dev_list[0].queue = queue; env->cl_dev_list[0].program = program; env->cl_dev_list[0].ker_filter = ker_filter; env->cl_dev_list[0].ker_filter_in1_out32 = ker_filter_in1_out32; env->cl_dev_list[0].ker_filter_in128_out1 = ker_filter_in128_out1; env->cl_dev_list[0].ker_filter_in3_out32 = ker_filter_in3_out32; env->cl_dev_list[0].ker_filter_in128_out3 = ker_filter_in128_out3; env->cl_dev_list[0].name = &dev_name[0]; return true; }
static void create_program_from_bitcode(char* bitcode_path) { cl_int err; unsigned int i; // Instead of passing actual executable bits, we pass a path to the // already-compiled bitcode to clCreateProgramWithBinary. Note that // you may load bitcode for multiple devices in one call by passing // multiple paths and multiple devices. In the multiple-device case, // the indices should match: if device 0 is a 32-bit GPU, then path 0 // should be bitcode for a GPU. In the example below, we are loading // bitcode for one device only. size_t len = strlen(bitcode_path); program = clCreateProgramWithBinary(context, 1, &device, &len, (const unsigned char**)&bitcode_path, NULL, &err); check_status("clCreateProgramWithBinary", err); // The above tells OpenCL how to locate the intermediate bitcode, but we // still must build the program to produce executable bits for our // *specific* device. This transforms gpu32 bitcode into actual executable // bits for an AMD or Intel compute device (for example). err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); check_status("clBuildProgram", err); // And that's it -- we have a fully-compiled program created from the // bitcode. Let's ask OpenCL for the test kernel. kernel = clCreateKernel(program, "vecadd", &err); check_status("clCreateKernel", err); // And now, let's test the kernel with some dummy data. float *host_a = (float*)malloc(sizeof(float)*4*NELEMENTS); float *host_b = (float*)malloc(sizeof(float)*4*NELEMENTS); float *host_c = (float*)malloc(sizeof(float)*4*NELEMENTS); // We pack some host buffers with our data. for (i = 0; i < NELEMENTS; i++) { host_a[i*4+0] = host_b[i*4+0] = i; host_a[i*4+1] = host_b[i*4+1] = i; host_a[i*4+2] = host_b[i*4+2] = i; host_a[i*4+3] = host_b[i*4+3] = i; } // And create and load some CL memory buffers with that host data. cl_mem a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float4)*NELEMENTS, host_a, &err); cl_mem b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float4)*NELEMENTS, host_b, &err); // CL buffer 'c' is for output, so we don't prepopulate it with data. cl_mem c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float4)*NELEMENTS, NULL, &err); if (a == NULL || b == NULL || c == NULL) { fprintf(stderr, "Error: Unable to create OpenCL buffer memory objects.\n"); exit(1); } // We set the CL buffers as arguments for the 'vecadd' kernel. int argc = 0; err |= clSetKernelArg(kernel, argc++, sizeof(cl_mem), &a); err |= clSetKernelArg(kernel, argc++, sizeof(cl_mem), &b); err |= clSetKernelArg(kernel, argc++, sizeof(cl_mem), &c); check_status("clSetKernelArg", err); // Launch the kernel over a single dimension, which is the same size // as the number of float4s. We let OpenCL select the local dimensions // by passing 'NULL' as the 6th parameter. size_t global = NELEMENTS; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); check_status("clEnqueueNDRangeKernel", err); // Read back the results (blocking, so everything finishes), and then // validate the results. clEnqueueReadBuffer(queue, c, CL_TRUE, 0, NELEMENTS*sizeof(cl_float4), host_c, 0, NULL, NULL); int success = 1; for (i = 0; i < NELEMENTS; i++) { if ( host_c[i*4+0] != i*2.0 || host_c[i*4+1] != i * 2.0 || host_c[i*4+2] != i*2.0 || host_c[i*4+3] != i * 2.0 ) { success = 0; fprintf(stderr, "Validation failed at index %d\n", i); fprintf(stderr, "Kernel FAILED!\n"); break; } } if (success) { fprintf(stdout, "Validation successful.\n"); } }
_clState *initCl(unsigned int gpu, char *name, size_t nameSize) { _clState *clState = calloc(1, sizeof(_clState)); bool patchbfi = false, prog_built = false; struct cgpu_info *cgpu = &gpus[gpu]; cl_platform_id platform = NULL; char pbuff[256], vbuff[255]; cl_platform_id* platforms; cl_uint preferred_vwidth; cl_device_id *devices; cl_uint numPlatforms; cl_uint numDevices; cl_int status; status = clGetPlatformIDs(0, NULL, &numPlatforms); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platforms. (clGetPlatformsIDs)", status); return NULL; } platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status); return NULL; } if (opt_platform_id >= (int)numPlatforms) { applog(LOG_ERR, "Specified platform that does not exist"); return NULL; } status = clGetPlatformInfo(platforms[opt_platform_id], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status); return NULL; } platform = platforms[opt_platform_id]; if (platform == NULL) { perror("NULL platform found!\n"); return NULL; } applog(LOG_INFO, "CL Platform vendor: %s", pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform name: %s", pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(vbuff), vbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform version: %s", vbuff); status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status); return NULL; } if (numDevices > 0 ) { devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); /* Now, get the device list data */ status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (list)", status); return NULL; } applog(LOG_INFO, "List of devices:"); unsigned int i; for (i = 0; i < numDevices; i++) { status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device Info", status); return NULL; } applog(LOG_INFO, "\t%i\t%s", i, pbuff); } if (gpu < numDevices) { status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device Info", status); return NULL; } applog(LOG_INFO, "Selected %i: %s", gpu, pbuff); strncpy(name, pbuff, nameSize); } else { applog(LOG_ERR, "Invalid GPU %i", gpu); return NULL; } } else return NULL; cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status); return NULL; } ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status); if (status != CL_SUCCESS) /* Try again without OOE enable */ clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status); return NULL; } /* Check for BFI INT support. Hopefully people don't mix devices with * and without it! */ char * extensions = malloc(1024); const char * camo = "cl_amd_media_ops"; char *find; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS", status); return NULL; } find = strstr(extensions, camo); if (find) clState->hasBitAlign = true; /* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */ char * devoclver = malloc(1024); const char * ocl10 = "OpenCL 1.0"; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_VERSION, 1024, (void *)devoclver, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_VERSION", status); return NULL; } find = strstr(devoclver, ocl10); if (!find) clState->hasOpenCL11plus = true; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&preferred_vwidth, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", status); return NULL; } applog(LOG_DEBUG, "Preferred vector width reported %d", preferred_vwidth); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE", status); return NULL; } applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status); return NULL; } applog(LOG_DEBUG, "Max mem alloc size is %u", cgpu->max_alloc); /* Create binary filename based on parameters passed to opencl * compiler to ensure we only load a binary that matches what would * have otherwise created. The filename is: * name + kernelname +/- g(offset) + v + vectors + w + work_size + l + sizeof(long) + .bin * For scrypt the filename is: * name + kernelname + g + lg + lookup_gap + tc + thread_concurrency + w + work_size + l + sizeof(long) + .bin */ char binaryfilename[255]; char filename[255]; char numbuf[16]; if (cgpu->kernel == KL_NONE) { if (opt_scrypt) { applog(LOG_INFO, "Selecting scrypt kernel"); clState->chosen_kernel = KL_SCRYPT; } else if (!strstr(name, "Tahiti") && /* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */ (strstr(vbuff, "844.4") || // Linux 64 bit ATI 2.6 SDK strstr(vbuff, "851.4") || // Windows 64 bit "" strstr(vbuff, "831.4") || strstr(vbuff, "898.1") || // 12.2 driver SDK strstr(vbuff, "923.1") || // 12.4 strstr(vbuff, "938.2") || // SDK 2.7 strstr(vbuff, "1113.2"))) {// SDK 2.8 applog(LOG_INFO, "Selecting diablo kernel"); clState->chosen_kernel = KL_DIABLO; /* Detect all 7970s, older ATI and NVIDIA and use poclbm */ } else if (strstr(name, "Tahiti") || !clState->hasBitAlign) { applog(LOG_INFO, "Selecting poclbm kernel"); clState->chosen_kernel = KL_POCLBM; /* Use phatk for the rest R5xxx R6xxx */ } else { applog(LOG_INFO, "Selecting phatk kernel"); clState->chosen_kernel = KL_PHATK; } cgpu->kernel = clState->chosen_kernel; } else { clState->chosen_kernel = cgpu->kernel; if (clState->chosen_kernel == KL_PHATK && (strstr(vbuff, "844.4") || strstr(vbuff, "851.4") || strstr(vbuff, "831.4") || strstr(vbuff, "898.1") || strstr(vbuff, "923.1") || strstr(vbuff, "938.2") || strstr(vbuff, "1113.2"))) { applog(LOG_WARNING, "WARNING: You have selected the phatk kernel."); applog(LOG_WARNING, "You are running SDK 2.6+ which performs poorly with this kernel."); applog(LOG_WARNING, "Downgrade your SDK and delete any .bin files before starting again."); applog(LOG_WARNING, "Or allow cgminer to automatically choose a more suitable kernel."); } } /* For some reason 2 vectors is still better even if the card says * otherwise, and many cards lie about their max so use 256 as max * unless explicitly set on the command line. Tahiti prefers 1 */ if (strstr(name, "Tahiti")) preferred_vwidth = 1; else if (preferred_vwidth > 2) preferred_vwidth = 2; switch (clState->chosen_kernel) { case KL_POCLBM: strcpy(filename, POCLBM_KERNNAME".cl"); strcpy(binaryfilename, POCLBM_KERNNAME); break; case KL_PHATK: strcpy(filename, PHATK_KERNNAME".cl"); strcpy(binaryfilename, PHATK_KERNNAME); break; case KL_DIAKGCN: strcpy(filename, DIAKGCN_KERNNAME".cl"); strcpy(binaryfilename, DIAKGCN_KERNNAME); break; case KL_SCRYPT: strcpy(filename, SCRYPT_KERNNAME".cl"); strcpy(binaryfilename, SCRYPT_KERNNAME); /* Scrypt only supports vector 1 */ cgpu->vwidth = 1; break; case KL_NONE: /* Shouldn't happen */ case KL_DIABLO: strcpy(filename, DIABLO_KERNNAME".cl"); strcpy(binaryfilename, DIABLO_KERNNAME); break; } if (cgpu->vwidth) clState->vwidth = cgpu->vwidth; else { clState->vwidth = preferred_vwidth; cgpu->vwidth = preferred_vwidth; } if (((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) && clState->vwidth == 1 && clState->hasOpenCL11plus) || opt_scrypt) clState->goffset = true; if (cgpu->work_size && cgpu->work_size <= clState->max_work_size) clState->wsize = cgpu->work_size; else if (strstr(name, "Tahiti")) clState->wsize = 64; else clState->wsize = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->vwidth; cgpu->work_size = clState->wsize; #ifdef USE_SCRYPT if (opt_scrypt) { if (!cgpu->opt_lg) { applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu); cgpu->lookup_gap = 2; } else cgpu->lookup_gap = cgpu->opt_lg; if (!cgpu->opt_tc) { unsigned int sixtyfours; sixtyfours = cgpu->max_alloc / 131072 / 64 - 1; cgpu->thread_concurrency = sixtyfours * 64; if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) { cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders; if (cgpu->thread_concurrency > cgpu->shaders * 5) cgpu->thread_concurrency = cgpu->shaders * 5; } applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu, cgpu->thread_concurrency); } else cgpu->thread_concurrency = cgpu->opt_tc; } #endif FILE *binaryfile; size_t *binary_sizes; char **binaries; int pl; char *source = file_contents(filename, &pl); size_t sourceSize[] = {(size_t)pl}; cl_uint slot, cpnd; slot = cpnd = 0; if (!source) return NULL; binary_sizes = calloc(sizeof(size_t) * MAX_GPUDEVICES * 4, 1); if (unlikely(!binary_sizes)) { applog(LOG_ERR, "Unable to calloc binary_sizes"); return NULL; } binaries = calloc(sizeof(char *) * MAX_GPUDEVICES * 4, 1); if (unlikely(!binaries)) { applog(LOG_ERR, "Unable to calloc binaries"); return NULL; } strcat(binaryfilename, name); if (clState->goffset) strcat(binaryfilename, "g"); if (opt_scrypt) { #ifdef USE_SCRYPT sprintf(numbuf, "lg%utc%u", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency); strcat(binaryfilename, numbuf); #endif } else { sprintf(numbuf, "v%d", clState->vwidth); strcat(binaryfilename, numbuf); } sprintf(numbuf, "w%d", (int)clState->wsize); strcat(binaryfilename, numbuf); sprintf(numbuf, "l%d", (int)sizeof(long)); strcat(binaryfilename, numbuf); strcat(binaryfilename, ".bin"); binaryfile = fopen(binaryfilename, "rb"); if (!binaryfile) { applog(LOG_DEBUG, "No binary found, generating from source"); } else { struct stat binary_stat; if (unlikely(stat(binaryfilename, &binary_stat))) { applog(LOG_DEBUG, "Unable to stat binary, generating from source"); fclose(binaryfile); goto build; } if (!binary_stat.st_size) goto build; binary_sizes[slot] = binary_stat.st_size; binaries[slot] = (char *)calloc(binary_sizes[slot], 1); if (unlikely(!binaries[slot])) { applog(LOG_ERR, "Unable to calloc binaries"); fclose(binaryfile); return NULL; } if (fread(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot]) { applog(LOG_ERR, "Unable to fread binaries"); fclose(binaryfile); free(binaries[slot]); goto build; } clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status); fclose(binaryfile); free(binaries[slot]); goto build; } fclose(binaryfile); applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename); goto built; } ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// build: clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithSource)", status); return NULL; } /* create a cl program executable for all the devices specified */ char *CompilerOptions = calloc(1, 256); #ifdef USE_SCRYPT if (opt_scrypt) sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, (int)clState->wsize); else #endif { sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d", (int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth); } applog(LOG_DEBUG, "Setting worksize to %d", clState->wsize); if (clState->vwidth > 1) applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth); if (clState->hasBitAlign) { strcat(CompilerOptions, " -D BITALIGN"); applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN"); if (strstr(name, "Cedar") || strstr(name, "Redwood") || strstr(name, "Juniper") || strstr(name, "Cypress" ) || strstr(name, "Hemlock" ) || strstr(name, "Caicos" ) || strstr(name, "Turks" ) || strstr(name, "Barts" ) || strstr(name, "Cayman" ) || strstr(name, "Antilles" ) || strstr(name, "Wrestler" ) || strstr(name, "Zacate" ) || strstr(name, "WinterPark" )) patchbfi = true; } else applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN"); if (patchbfi) { strcat(CompilerOptions, " -D BFI_INT"); applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT"); } else applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch"); if (clState->goffset) strcat(CompilerOptions, " -D GOFFSET"); if (!clState->hasOpenCL11plus) strcat(CompilerOptions, " -D OCL1"); applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions); status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL); free(CompilerOptions); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_ERR, "%s", log); return NULL; } prog_built = true; status = clGetProgramInfo(clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)", status); return NULL; } status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)", status); return NULL; } /* The actual compiled binary ends up in a RANDOM slot! Grr, so we have * to iterate over all the binary slots and find where the real program * is. What the heck is this!? */ for (slot = 0; slot < cpnd; slot++) if (binary_sizes[slot]) break; /* copy over all of the generated binaries. */ applog(LOG_DEBUG, "Binary size for gpu %d found in binary slot %d: %d", gpu, slot, binary_sizes[slot]); if (!binary_sizes[slot]) { applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!"); return NULL; } binaries[slot] = calloc(sizeof(char) * binary_sizes[slot], 1); status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL ); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)", status); return NULL; } /* Patch the kernel if the hardware supports BFI_INT but it needs to * be hacked in */ if (patchbfi) { unsigned remaining = binary_sizes[slot]; char *w = binaries[slot]; unsigned int start, length; /* Find 2nd incidence of .text, and copy the program's * position and length at a fixed offset from that. Then go * back and find the 2nd incidence of \x7ELF (rewind by one * from ELF) and then patch the opcocdes */ if (!advance(&w, &remaining, ".text")) goto build; w++; remaining--; if (!advance(&w, &remaining, ".text")) { /* 32 bit builds only one ELF */ w--; remaining++; } memcpy(&start, w + 285, 4); memcpy(&length, w + 289, 4); w = binaries[slot]; remaining = binary_sizes[slot]; if (!advance(&w, &remaining, "ELF")) goto build; w++; remaining--; if (!advance(&w, &remaining, "ELF")) { /* 32 bit builds only one ELF */ w--; remaining++; } w--; remaining++; w += start; remaining -= start; applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching", w, remaining); patch_opcodes(w, length); status = clReleaseProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Releasing program. (clReleaseProgram)", status); return NULL; } clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status); return NULL; } /* Program needs to be rebuilt */ prog_built = false; } free(source); /* Save the binary to be loaded next time */ binaryfile = fopen(binaryfilename, "wb"); if (!binaryfile) { /* Not a fatal problem, just means we build it again next time */ applog(LOG_DEBUG, "Unable to create file %s", binaryfilename); } else { if (unlikely(fwrite(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot])) { applog(LOG_ERR, "Unable to fwrite to binaryfile"); return NULL; } fclose(binaryfile); } built: if (binaries[slot]) free(binaries[slot]); free(binaries); free(binary_sizes); applog(LOG_INFO, "Initialising kernel %s with%s bitalign, %d vectors and worksize %d", filename, clState->hasBitAlign ? "" : "out", clState->vwidth, clState->wsize); if (!prog_built) { /* create a cl program executable for all the devices specified */ status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_ERR, "%s", log); return NULL; } } /* get a kernel object handle for a kernel with the given name */ clState->kernel = clCreateKernel(clState->program, "search", &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Kernel from program. (clCreateKernel)", status); return NULL; } #ifdef USE_SCRYPT if (opt_scrypt) { size_t ipt = (2048 / cgpu->lookup_gap + (2048 % cgpu->lookup_gap > 0)); size_t bufsize = 128 * ipt * cgpu->thread_concurrency; /* Use the max alloc value which has been rounded to a power of * 2 greater >= required amount earlier */ if (bufsize > cgpu->max_alloc) { applog(LOG_WARNING, "Maximum buffer memory device %d supports says %u", gpu, cgpu->max_alloc); applog(LOG_WARNING, "Your scrypt settings come to %u", bufsize); } applog(LOG_DEBUG, "Creating scrypt buffer sized %u", bufsize); clState->padbufsize = bufsize; /* This buffer is weird and might work to some degree even if * the create buffer call has apparently failed, so check if we * get anything back before we call it a failure. */ clState->padbuffer8 = NULL; clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); if (status != CL_SUCCESS && !clState->padbuffer8) { applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease CT or increase LG", status); return NULL; } clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status); return NULL; } } #endif clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status); return NULL; } return clState; }
int BinomialOption::setupCL() { cl_int status = CL_SUCCESS; size_t deviceListSize; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if(isPlatformEnabled()) { platform = platforms[platformId]; } else { for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) { return SDK_FAILURE; } platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } } delete[] platforms; } if(NULL == platform) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } // Display available devices. if(!sampleCommon->displayDevices(platform, dType)) { sampleCommon->error("sampleCommon::displayDevices() failed"); return SDK_FAILURE; } /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateContextFromType failed.")) { return SDK_FAILURE; } /* First, get the size of device list data */ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetContextInfo failed.")) { return SDK_FAILURE; } int deviceCount = (int)(deviceListSize / sizeof(cl_device_id)); if(!sampleCommon->validateDeviceId(deviceId, deviceCount)) { sampleCommon->error("sampleCommon::validateDeviceId() failed"); return SDK_FAILURE; } /* Now allocate memory for device list based on the size we got earlier */ devices = (cl_device_id *)malloc(deviceListSize); if(devices == NULL) { sampleCommon->error("Failed to allocate memory (devices)."); return SDK_FAILURE; } /* Now, get the device list data */ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetContextInfo failed.")) { return SDK_FAILURE; } /* Create command queue */ commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateCommandQueue failed.")) { return SDK_FAILURE; } /* Get Device specific Information */ status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void*)&maxWorkGroupSize, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo" "CL_DEVICE_MAX_WORK_GROUP_SIZE failed.")) return SDK_FAILURE; status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void*)&maxDimensions, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo" "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed.")) { return SDK_FAILURE; } maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t)); status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * maxDimensions, (void*)maxWorkItemSizes, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo" "CL_DEVICE_MAX_WORK_ITEM_SIZES failed.")) { return SDK_FAILURE; } status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), (void*)&totalLocalMemory, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo" "CL_DEVICE_LOCAL_MEM_SIZE failed.")) { return SDK_FAILURE; } /** * Create and initialize memory objects */ /* Create memory object for stock price */ randBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, numSamples * sizeof(cl_float4), randArray, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateBuffer failed. (randBuffer)")) { return SDK_FAILURE; } /* Create memory object for output array */ outBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, numSamples * sizeof(cl_float4), output, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateBuffer failed. (outBuffer)")) { return SDK_FAILURE; } /* create a CL program using the kernel source */ streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(!kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * binary = kernelFile.source().c_str(); size_t binarySize = kernelFile.source().size(); program = clCreateProgramWithBinary(context, 1, &devices[deviceId], (const size_t*)&binarySize, (const unsigned char**)&binary, NULL, &status); } else { // special case for packetized OpenCL (can not yet compile .cl directly) char vName[100]; status = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(vName), vName, NULL); const bool platformIsPacketizedOpenCL = !strcmp(vName, "Ralf Karrenberg, Saarland University"); if (!strcmp(vName, "Intel(R) Corporation")) { vendorName = "intel"; } else if (!strcmp(vName, "Advanced Micro Devices, Inc.")) { vendorName = "amd"; } else if (platformIsPacketizedOpenCL) { vendorName = "pkt"; } else { printf("ERROR: vendor not recognized: %s\n", vName); } kernelPath.append("BinomialOption_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = {strlen(source)}; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); } if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) { return SDK_FAILURE; } std::string flagsStr = std::string(""); // Get additional options if(isComplierFlagsSpecified()) { streamsdk::SDKFile flagsFile; std::string flagsPath = sampleCommon->getPath(); flagsPath.append(flags.c_str()); if(!flagsFile.open(flagsPath.c_str())) { std::cout << "Failed to load flags file: " << flagsPath << std::endl; return SDK_FAILURE; } flagsFile.replaceNewlineWithSpaces(); const char * flags = flagsFile.source().c_str(); flagsStr.append(flags); } if(flagsStr.size() != 0) std::cout << "Build Options are : " << flagsStr.c_str() << std::endl; /* create a cl program executable for all the devices specified */ status = clBuildProgram(program, 1, &devices[deviceId], flagsStr.c_str(), NULL, NULL); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { return SDK_FAILURE; } buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sampleCommon->error("Failed to allocate host memory. (buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) { return SDK_FAILURE; } } /* get a kernel object handle for a kernel with the given name */ kernel = clCreateKernel(program, "binomial_options", &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateKernel failed.")) { return SDK_FAILURE; } /* Get kernel work group size */ status = clGetKernelWorkGroupInfo(kernel, devices[deviceId], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetKernelWorkGroupInfo failed.")) { return SDK_FAILURE; } /* If group-size is gerater than maximum supported on kernel */ if((size_t)(numSteps + 1) > kernelWorkGroupSize) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << (numSteps + 1) << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelWorkGroupSize << std::endl; std::cout << "Using appropiate group-size." << std::endl; std::cout << "-------------------------------------------" << std::endl; } numSteps = (cl_int)kernelWorkGroupSize - 2; } return SDK_SUCCESS; }
int DeviceFission::setupCLRuntime() { cl_int status = CL_SUCCESS; // Create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("DeviceFission_Kernels.cl"); buildData.devices = Devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); // Get allocate memory for subCmdQueue subCmdQueue = (cl_command_queue*)malloc(numSubDevices * sizeof(cl_command_queue)); CHECK_ALLOCATION(subCmdQueue,"Failed to allocate memory. (subCmdQueue)"); // Create command queue subCmdQueue for(cl_uint i = 0; i < numSubDevices; i++) { // Create command queue subCmdQueue[i] = clCreateCommandQueue(rContext, subDevices[i], 0, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed. (subCmdQueue)"); } // Create command queue gpuCmdQueue gpuCmdQueue = clCreateCommandQueue(rContext, gpuDevice, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed. (gpuCmdQueue)"); // Create memory objects for input InBuf = clCreateBuffer(rContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, length * sizeof(cl_int), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (InBuf)"); // Get allocate memory for sub devices output subOutBuf = (cl_mem*)malloc(numSubDevices * sizeof(cl_mem)); for(cl_uint i = 0; i < numSubDevices; i++) { // Create memory objects for sub devices output subOutBuf[i] = clCreateBuffer(rContext, CL_MEM_WRITE_ONLY, half_length * sizeof(cl_int) , NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (subOutBuf)"); } // Get allocate memory for GPU device output gpuOutBuf = (cl_mem*)malloc(numSubDevices * sizeof(cl_mem)); for(cl_uint i = 0; i < numSubDevices; i++) { // Create memory objects for GPU device output gpuOutBuf[i] = clCreateBuffer(rContext, CL_MEM_WRITE_ONLY, half_length * sizeof(cl_int) , NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (gpuOutBuf)"); } streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); char * source = NULL; size_t sourceSize[] = {0}; char * binary = NULL; size_t binarySize = 0; if(isLoadBinaryEnabled()) { kernelPath += loadBinary; if(kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } // Get binaries and binary sizes for CPU devices char** subBinaries = (char**)malloc(numSubDevices * sizeof(char*)); if(subBinaries == NULL) { sampleCommon->error("Failed to allocate memory(subBinaries)"); return SDK_FAILURE; } size_t* subBinariesSize = (size_t*)malloc(numSubDevices * sizeof(size_t*)); if(subBinariesSize == NULL) { sampleCommon->error("Failed to allocate memory(subBinariesSize)"); return SDK_FAILURE; } for(cl_uint i = 0; i < numSubDevices; ++i) { subBinaries[i] = (char*)kernelFile.source().c_str(); subBinariesSize[i] = kernelFile.source().size(); } subProgram = clCreateProgramWithBinary(rContext, numSubDevices, subDevices, (const size_t *)subBinariesSize, (const unsigned char**)subBinaries, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateProgramWithBinary failed.(subProgram)"); streamsdk::SDKFile kernelFileGPU; std::string kernelPathGPU = sampleCommon->getPath(); if(!gpuAvailable) { loadBinaryGPU = loadBinary; } kernelPathGPU += loadBinaryGPU; if(loadBinaryGPU.length() == 0) { std::cout << "Failed to load GPU kernel file, please assign it by '--loadgpu'. "<< std::endl; return SDK_FAILURE; } if(kernelFileGPU.readBinaryFromFile(kernelPathGPU.c_str())) { std::cout << "Failed to load GPU kernel file : " << kernelPathGPU << std::endl; return SDK_FAILURE; } // Get binaries and binary sizes for GPU device char* subBinariesGPU; size_t subBinariesSizeGPU;; subBinariesGPU = (char*)kernelFileGPU.source().c_str(); subBinariesSizeGPU = kernelFileGPU.source().size(); gpuProgram = clCreateProgramWithBinary(rContext, 1, &gpuDevice, &subBinariesSizeGPU, (const unsigned char **)&subBinariesGPU, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateProgramWithBinary failed.(gpuProgram)"); free(subBinaries); free(subBinariesSize); subBinariesSize = NULL; subBinaries = NULL; } else { kernelPath.append("DeviceFission_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str()))//bool { std::cout << "Failed to load kernel file: " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = {strlen(source)}; // Create a CL program for sub-devices using the kernel source subProgram = clCreateProgramWithSource(rContext, 1, (const char**)&source, sourceSize, &status); CHECK_OPENCL_ERROR(status, "clCreateProgramWithSource failed.(subProgram)"); // Create a CL program for GPU device using the kernel source gpuProgram = clCreateProgramWithSource(rContext, 1, (const char**)&source, sourceSize, &status); CHECK_OPENCL_ERROR(status, "clCreateProgramWithSource failed.(gpuProgram)"); } // Get build options const char *flags; streamsdk::SDKFile flagsFile; std::string flagsPath = sampleCommon->getPath(); if(buildData.flagsFileName.size() != 0) { flagsPath.append(buildData.flagsFileName.c_str()); if(!flagsFile.open(flagsPath.c_str())) { std::cout << "Failed to load flags file: " << flagsPath << std::endl; return SDK_FAILURE; } flagsFile.replaceNewlineWithSpaces(); flags = flagsFile.source().c_str(); if(strlen(flags) != 0) std::cout << "Build Options are : " << flags << std::endl; } else { flags = NULL; } // Create a cl program executable for all sub-devices status = clBuildProgram(subProgram, numSubDevices, subDevices, flags, NULL, NULL); CHECK_OPENCL_ERROR(status, "clBuildProgram failed.(subProgram)"); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(subProgram, subDevices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) return SDK_FAILURE; buildLog = (char*)malloc(buildLogSize); if(NULL == buildLog) { sampleCommon->error("Failed to allocate host memory.(buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(subProgram, subDevices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG(SUB-DEVICES)\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal(status, CL_SUCCESS, "clBuildProgram failed. (SUB-DEVICES)")) return SDK_FAILURE; } // Create a cl program executable for GPU device status = clBuildProgram(gpuProgram, 1, &gpuDevice, flags, NULL, NULL); CHECK_OPENCL_ERROR(status, "clBuildProgram failed.(gpuProgram)"); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(gpuProgram, gpuDevice, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) return SDK_FAILURE; buildLog = (char*)malloc(buildLogSize); if(NULL == buildLog) { sampleCommon->error("Failed to allocate host memory.(buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(gpuProgram, gpuDevice, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG(GPU-DEVICE)\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal(status, CL_SUCCESS, "clBuildProgram failed. (GPU-DEVICE)")) return SDK_FAILURE; } // Get a kernel object handle for a kernel with the given name subKernel[0] = clCreateKernel(subProgram, "Add", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(subKernel[0])"); // Get a kernel object handle for a kernel with the given name subKernel[1] = clCreateKernel(subProgram, "Sub", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(subKernel[1])"); // Get a kernel object handle for a kernel with the given name gpuKernel[0] = clCreateKernel(gpuProgram, "Add", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(gpuKernel[0])"); // Get a kernel object handle for a kernel with the given name gpuKernel[1] = clCreateKernel(gpuProgram, "Sub", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(gpuKernel[1])"); return SDK_SUCCESS; }
void init_platform() { cl_uint num_platforms; cl_uint num_devices; // Get the platform ID status = clGetPlatformIDs(1, &platform, &num_platforms); if(status != CL_SUCCESS) { printf("Failed clGetPlatformIDs. %d", status); freeResources(); exit (1); } if(num_platforms != 1) { printf("Found %d platforms!\n", num_platforms); freeResources(); exit (1); } // Get the device ID status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &num_devices); if(status != CL_SUCCESS) { printf("Failed clGetDeviceIDs. %d", status); freeResources(); exit (1); } if(num_devices != 1) { printf("Found %d devices!\n", num_devices); freeResources(); exit (1); } // Create a context context = clCreateContext(0, 1, &device, NULL, NULL, &status); if(status != CL_SUCCESS) { printf("Failed clCreateContext. %d", status); freeResources(); exit (1); } queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status); if(status != CL_SUCCESS) { printf("Failed to create queue. Error %d", status); freeResources(); exit (1); } // Create the program using binary already compiled offline using aoc (i.e. the .aocx file) FILE* fp = fopen(AOCX_FILE, "rb"); if (fp == NULL) { printf("Failed to open %s file (fopen).\n", AOCX_FILE); exit(1); } fseek(fp, 0, SEEK_END); size_t binary_length = ftell(fp); unsigned char*binary = (unsigned char*) malloc(sizeof(unsigned char) * binary_length); assert(binary && "Malloc failed"); rewind(fp); if (fread((void*)binary, binary_length, 1, fp) == 0) { printf("Failed to read from moving_average.aocx file (fread).\n"); exit (1); } fclose(fp); cl_int kernel_status; program = clCreateProgramWithBinary(context, 1, &device, &binary_length, (const unsigned char**)&binary, &kernel_status, &status); if(status != CL_SUCCESS || kernel_status != CL_SUCCESS) { printf("Failed clCreateProgramWithBinary. %d", status); freeResources(); exit (1); } // Build the program status = clBuildProgram(program, 0, NULL, "", NULL, NULL); if(status != CL_SUCCESS) { printf("Failed clBuildProgram. %d", status); freeResources(); exit (1); } }
void TexDecoder_OpenCL_Initialize() { if(!g_Inited) { if(!OpenCL::Initialize()) return; cl_int err = 1; size_t binary_size = 0; char *binary = NULL; char *header = NULL; size_t nDevices = 0; cl_device_id *devices = NULL; size_t *binary_sizes = NULL; char **binaries = NULL; std::string filename; char dolphin_rev[HEADER_SIZE]; filename = File::GetUserPath(D_OPENCL_IDX) + "kernel.bin"; snprintf(dolphin_rev, HEADER_SIZE, "%-31s", scm_rev_str); { File::IOFile input(filename, "rb"); if (!input) { binary_size = 0; } else { binary_size = input.GetSize(); header = new char[HEADER_SIZE]; binary = new char[binary_size]; input.ReadBytes(header, HEADER_SIZE); input.ReadBytes(binary, binary_size); } } if (binary_size > 0) { if (binary_size > HEADER_SIZE) { if (strncmp(header, dolphin_rev, HEADER_SIZE) == 0) { g_program = clCreateProgramWithBinary(OpenCL::GetContext(), 1, &OpenCL::device_id, &binary_size, (const unsigned char**)&binary, NULL, &err); if (err != CL_SUCCESS) { OpenCL::HandleCLError(err, "clCreateProgramWithBinary"); } if (!err) { err = clBuildProgram(g_program, 1, &OpenCL::device_id, NULL, NULL, NULL); if (err != CL_SUCCESS) { OpenCL::HandleCLError(err, "clBuildProgram"); } } } } delete [] header; delete [] binary; } // If an error occurred using the kernel binary, recompile the kernels if (err) { std::string code; filename = File::GetSysDirectory() + OPENCL_DIR DIR_SEP "TextureDecoder.cl"; if (!File::ReadFileToString(filename.c_str(), code)) { ERROR_LOG(VIDEO, "Failed to load OpenCL code %s - file is missing?", filename.c_str()); return; } g_program = OpenCL::CompileProgram(code.c_str()); err = clGetProgramInfo(g_program, CL_PROGRAM_NUM_DEVICES, sizeof(nDevices), &nDevices, NULL); if (err != CL_SUCCESS) { OpenCL::HandleCLError(err, "clGetProgramInfo"); } devices = (cl_device_id *)malloc( sizeof(cl_device_id) *nDevices); err = clGetProgramInfo(g_program, CL_PROGRAM_DEVICES, sizeof(cl_device_id)*nDevices, devices, NULL); if (err != CL_SUCCESS) { OpenCL::HandleCLError(err, "clGetProgramInfo"); } binary_sizes = (size_t *)malloc(sizeof(size_t)*nDevices); err = clGetProgramInfo(g_program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nDevices, binary_sizes, NULL); if (err != CL_SUCCESS) { OpenCL::HandleCLError(err, "clGetProgramInfo"); } binaries = (char **)malloc(sizeof(char *)*nDevices); for (u32 i = 0; i < nDevices; ++i) { if (binary_sizes[i] != 0) { binaries[i] = (char *)malloc(HEADER_SIZE + binary_sizes[i]); } else { binaries[i] = NULL; } } err = clGetProgramInfo( g_program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL ); if (err != CL_SUCCESS) { OpenCL::HandleCLError(err, "clGetProgramInfo"); } if (!err) { filename = File::GetUserPath(D_OPENCL_IDX) + "kernel.bin"; File::IOFile output(filename, "wb"); if (!output) { binary_size = 0; } else { // Supporting one OpenCL device for now output.WriteBytes(dolphin_rev, HEADER_SIZE); output.WriteBytes(binaries[0], binary_sizes[0]); } } for (u32 i = 0; i < nDevices; ++i) { if (binary_sizes[i] != 0) { free(binaries[i]); } } if (binaries != NULL) free(binaries); if (binary_sizes != NULL) free(binary_sizes); if (devices != NULL) free(devices); } for (int i = 0; i <= GX_TF_CMPR; ++i) { if (g_DecodeParametersNative[i].name) g_DecodeParametersNative[i].kernel = OpenCL::CompileKernel(g_program, g_DecodeParametersNative[i].name); if (g_DecodeParametersRGBA[i].name) g_DecodeParametersRGBA[i].kernel = OpenCL::CompileKernel(g_program, g_DecodeParametersRGBA[i].name); } // Allocating maximal Wii texture size in advance, so that we don't have to allocate/deallocate per texture #ifndef DEBUG_OPENCL g_clsrc = clCreateBuffer(OpenCL::GetContext(), CL_MEM_READ_ONLY , 1024 * 1024 * sizeof(u32), NULL, NULL); g_cldst = clCreateBuffer(OpenCL::GetContext(), CL_MEM_WRITE_ONLY, 1024 * 1024 * sizeof(u32), NULL, NULL); #endif g_Inited = true; } }
cl_kernel CLContext::generate_kernel_id ( CLKernel *kernel, const uint8_t *source, size_t length, CLContext::KernelBuildType type) { struct CLProgram { cl_program id; CLProgram () : id (NULL) {} ~CLProgram () { if (id) clReleaseProgram (id); } }; CLProgram program; cl_kernel kernel_id = NULL; cl_int error_code = CL_SUCCESS; cl_device_id device_id = _device->get_device_id (); const char * name = kernel->get_kernel_name (); XCAM_ASSERT (source && length); XCAM_ASSERT (name); switch (type) { case KERNEL_BUILD_SOURCE: program.id = clCreateProgramWithSource ( _context_id, 1, (const char**)(&source), (const size_t *)&length, &error_code); break; case KERNEL_BUILD_BINARY: program.id = clCreateProgramWithBinary ( _context_id, 1, &device_id, (const size_t *)&length, (const uint8_t**)(&source), NULL, &error_code); break; } XCAM_FAIL_RETURN ( WARNING, error_code == CL_SUCCESS, NULL, "cl create program failed with error_cod:%d", error_code); XCAM_ASSERT (program.id); error_code = clBuildProgram (program.id, 1, &device_id, NULL, CLContext::program_pfn_notify, this); if (error_code != CL_SUCCESS) { char error_log [XCAM_CL_MAX_STR_SIZE]; xcam_mem_clear (error_log); clGetProgramBuildInfo (program.id, device_id, CL_PROGRAM_BUILD_LOG, sizeof (error_log) - 1, error_log, NULL); XCAM_LOG_WARNING ("CL build program failed on %s, build log:%s", name, error_log); return NULL; } kernel_id = clCreateKernel (program.id, name, &error_code); XCAM_FAIL_RETURN ( WARNING, error_code == CL_SUCCESS, NULL, "cl create kernel(%s) failed with error_cod:%d", name, error_code); return kernel_id; }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program binary */ unsigned char *bin; size_t bin_len; cl_int bin_ret; /* Read program binary */ if (argc == 2) bin = read_buffer((char *)argv[1], &bin_len); else { printf("error: No binary specified\n"); exit(1); } /* Create a program */ cl_program program; program = clCreateProgramWithBinary(context, 1, &device, &bin_len, (const unsigned char **)&bin, &bin_ret, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithBinary' failed\n"); exit(1); } if (bin_ret != CL_SUCCESS) { printf("error: Invalid binary for device\n"); exit(1); } printf("program=%p\n", program); /* Free binary */ free(bin); printf("program binary loaded\n"); printf("\n"); ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object*/ cl_kernel kernel; kernel = clCreateKernel(program, "subtract_floatfloat", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_float *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_float)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_float)(2.0); /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create and init host side src buffer 1 */ cl_float *src_1_host_buffer; src_1_host_buffer = malloc(num_elem * sizeof(cl_float)); for (int i = 0; i < num_elem; i++) src_1_host_buffer[i] = (cl_float)(2.0); /* Create and init device side src buffer 1 */ cl_mem src_1_device_buffer; src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float), src_1_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_float *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_float)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_float)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_float), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_float), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_float)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 1 */ free(src_1_host_buffer); /* Free device side src buffer 1 */ ret = clReleaseMemObject(src_1_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
int main() { cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem memobj = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; float mem[MEM_SIZE]; FILE *fp; char fileName[] = "./kernel.clbin"; size_t binary_size; char *binary_buf; cl_int binary_status; cl_int i; /* カーネルを含むオブジェクトファイルをロード */ fp = fopen(fileName, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } binary_buf = (char *)malloc(MAX_BINARY_SIZE); binary_size = fread( binary_buf, 1, MAX_BINARY_SIZE, fp ); fclose( fp ); /* データを初期化 */ for( i = 0; i < MEM_SIZE; i++ ) { mem[i] = i; } /* プラットフォーム・デバイスの情報の取得 */ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); /* OpenCLコンテキストの作成 */ context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); /* コマンドキューの作成 */ command_queue = clCreateCommandQueue(context, device_id, 0, &ret); /* メモリバッファの作成 */ memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(float), NULL, &ret); /* メモリバッファにデータを転送 */ ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL); /* 読み込んだバイナリからカーネルプログラムを作成 */ program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *)&binary_size, (const unsigned char **)&binary_buf, &binary_status, &ret); /* OpenCLカーネルの作成 */ kernel = clCreateKernel(program, "vecAdd", &ret); printf("err:%d\n", ret); /* OpenCLカーネル引数の設定 */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj); size_t global_work_size[3] = {MEM_SIZE, 0, 0}; size_t local_work_size[3] = {MEM_SIZE, 0, 0}; /* OpenCLカーネルを実行 */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); /* メモリバッファから結果を取得 */ ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL); /* 結果の表示 */ for(i=0; i<MEM_SIZE; i++) { printf("mem[%d] : %f\n", i, mem[i]); } /* 終了処理 */ ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(memobj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(binary_buf); return 0; }