Esempio n. 1
2
int main(int argc, char** argv)
{
  int err;                            // error code returned from api calls
  cl_platform_id platform_id;         // platform id
  cl_device_id device_id;             // compute device id 
  cl_context context;                 // compute context
  cl_command_queue commands;          // compute command queue
  cl_program program;                 // compute program
  cl_kernel kernel;                   // compute kernel

  size_t global[2];                   // global domain size for our calculation
  size_t local[2];                    // local domain size for our calculation

  char cl_platform_vendor[1001];
  char cl_platform_name[1001];
   

  cl_mem in_array;                     // device memory used for the input array
  //cl_mem synaptic_weights;             // device memory used for the input array
  cl_mem out_array;                    // device memory used for the output array
   
  if (argc != 2){
    printf("%s <inputfile>\n", argv[0]);
    return -1;
  }

	//float in_array[NO_NODES];
	//float out_array[NO_NODES];
	//float synaptic_weights[NO_NODES*NO_NODES];
	float in_array_tb[NO_NODES];
	float out_array_tb[NO_NODES];
	//float synaptic_weights_tb[NO_NODES*NO_NODES];
	float temp =0;
	int i = 0;
    	int j = 0;
	int index = 0;
	FILE* ifp;
	char* mode = "r";
  //
  // Connect to first platform
  //
  err = clGetPlatformIDs(1,&platform_id,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to find an OpenCL platform!\n");
    printf("Test failed\n");
    return -1;
  }
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
    printf("Test failed\n");
    return -1;
  }
  printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor);
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n");
    printf("Test failed\n");
    return -1;
  }
  printf("CL_PLATFORM_NAME %s\n",cl_platform_name);
 
  // Connect to a compute device
  //
  int fpga = 0;
#if defined (FPGA_DEVICE)
  fpga = 1;
#endif
  err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU,
                       1, &device_id, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to create a device group!\n");
    printf("Test failed\n");
    return -1;
  }
  
  //
  // Create a compute context 
  //
  context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  if (!context)
  {
    printf("Error: Failed to create a compute context!\n");
    printf("Test failed\n");
    return -1;
  }

  //relu_1(in_array,synaptic_weights,out_array);


  // Fill our data sets with pattern
  //
  //int i = 0;
  //for(i = 0; i < DATA_SIZE; i++) {
  //  a[i] = (int)i;
  //  b[i] = (int)i;
  //  results[i] = 0;
  //}
  //
  
  
  // Create a command commands
  commands = clCreateCommandQueue(context, device_id, 0, &err);
  if (!commands)
  {
    printf("Error: Failed to create a command commands!\n");
    printf("Error: code %i\n",err);
    printf("Test failed\n");
    return -1;
  }

  int status;

  // Create Program Objects
  //
  
  // Load binary from disk
  unsigned char *kernelbinary;
  char *xclbin=argv[1];
  printf("loading %s\n", xclbin);
  int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
  if (n_i < 0) {
    printf("failed to load kernel from xclbin: %s\n", xclbin);
    printf("Test failed\n");
    return -1;
  }
  size_t n = n_i;
  // Create the compute program from offline
  program = clCreateProgramWithBinary(context, 1, &device_id, &n,
                                      (const unsigned char **) &kernelbinary, &status, &err);
  if ((!program) || (err!=CL_SUCCESS)) {
    printf("Error: Failed to create compute program from binary %d!\n", err);
    printf("Test failed\n");
    printf("err : %d %s\n",err,err);
  }

  // Build the program executable
  //
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    size_t len;
    char buffer[2048];

    printf("Error: Failed to build program executable!\n");
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    printf("%s\n", buffer);
    printf("Test failed\n");
    return -1;
  }

  // Create the compute kernel in the program we wish to run
  //
  kernel = clCreateKernel(program, "relu_1", &err);
  if (!kernel || err != CL_SUCCESS)
  {
    printf("Error: Failed to create compute kernel!\n");
    printf("Test failed\n");
    return -1;
  }

  // Create the input and output arrays in device memory for our calculation
  //
  in_array = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * NO_NODES, NULL, NULL);
  //synaptic_weights = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * NO_NODES * NO_NODES, NULL, NULL);
  out_array = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * NO_NODES, NULL, NULL);
  if (!in_array || /*!synaptic_weights ||*/ !out_array)
  {
    printf("Error: Failed to allocate device memory!\n");
    printf("Test failed\n");
    return -1;
  }    
    
	ifp = fopen("/home/agandhi92/sdaccel/relu_1/input.txt",mode);

	if(ifp == NULL)
	{
		printf("Input file not found \n");
  		return -1;
	}
	while (fscanf(ifp, "%f", &temp) != EOF && index < NO_NODES) {

		in_array_tb[index++] = temp;
	}
	index = 0;
	temp = 0;

	//ifp = fopen("/home/agandhi92/sdaccel/relu_1/weight.txt",mode);
	//if(ifp == NULL)
	//{
	//	printf("Weight file not found \n");
  	//	return -1;
	//}
	//while (fscanf(ifp, "%f", &temp) != EOF && index < (NO_NODES*NO_NODES)) {
	//	synaptic_weights_tb[index++] = temp;
	//}
   
  //
  // Write our data set into the input array in device memory 
  //
  err = clEnqueueWriteBuffer(commands, in_array, CL_TRUE, 0, sizeof(float) * NO_NODES, in_array_tb, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array a!\n");
    printf("Test failed\n");
    return -1;
  }

  // Write our data set into the input array in device memory 
  //
  //err = clEnqueueWriteBuffer(commands, synaptic_weights, CL_TRUE, 0, sizeof(float) *  NO_NODES *  NO_NODES, synaptic_weights_tb, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array b!\n");
    printf("Test failed\n");
    return -1;
  }
    
  // Set the arguments to our compute kernel
  //
  err = 0;
  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &in_array);
  //err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &synaptic_weights);
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_array);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  // Execute the kernel over the entire range of our 1d input data set
  // using the maximum number of work group items for this device
  //

  err = clEnqueueTask(commands, kernel, 0, NULL, NULL);

  if (err)
  {
    printf("Error: Failed to execute kernel! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  // Read back the results from the device to verify the output
  //
  cl_event readevent;
  err = clEnqueueReadBuffer( commands, out_array, CL_TRUE, 0, sizeof(float) * NO_NODES, out_array_tb, 0, NULL, &readevent );  
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output array! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  clWaitForEvents(1, &readevent);
    
  //printf("A\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",a[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //printf("B\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",b[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //printf("res\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",results[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
    
  // Validate our results
  //
  //correct = 0;
  //for(i = 0; i < DATA_SIZE; i++)
  //{
  //  int row = i/MATRIX_RANK;
  //  int col = i%MATRIX_RANK;
  //  int running = 0;
  //  int index;
  //  for (index=0;index<MATRIX_RANK;index++) {
  //    int aIndex = row*MATRIX_RANK + index;
  //    int bIndex = col + index*MATRIX_RANK;
  //    running += a[aIndex] * b[bIndex];
  //  }
  //  sw_results[i] = running;
  //}
  //  
  //for (i = 0;i < DATA_SIZE; i++) 
  //  if(results[i] == sw_results[i])
  //    correct++;
  //printf("Software\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  //printf("%0.2f ",sw_results[i]);
  //  printf("%d ",sw_results[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //  
  //  
  //// Print a brief summary detailing the results
  ////
  //printf("Computed '%d/%d' correct values!\n", correct, DATA_SIZE);
  //  
        
  // Shutdown and cleanup
	int temp_ = 0;


 for (j = 0; j < NO_NODES; j++)
 {
 	if (out_array_tb[j] >= 0) // || out_array_tb[j]== 0)
 	{
 		//printf("out_array[%d] = %f \n", j, out_array[j]);
 		temp_++;
 	}
 }


  clReleaseMemObject(in_array);
  //clReleaseMemObject(synaptic_weights);
  clReleaseMemObject(out_array);
  clReleaseProgram(program);
  clReleaseKernel(kernel);
  clReleaseCommandQueue(commands);
  clReleaseContext(context);

	if (temp_ == NO_NODES)
	{
		printf("*********************************************************** \n");
		printf("TEST PASSED !!!!!! The output matches the desired output. \n");
		printf("*********************************************************** \n");
		return EXIT_SUCCESS;
	}
	else
	{
		printf("**************************************************************** \n");
		printf("TEST Failed !!!!!! The output does not match the desired output. \n");
		printf("**************************************************************** \n");
		return -1;
	}

  //if(correct == DATA_SIZE){
  //  printf("Test passed!\n");
  //  return EXIT_SUCCESS;
  //}
  //else{
  //  printf("Test failed\n");
  //  return -1;
  //}
}
Esempio n. 2
0
int
buildProgramFromAmdBin(unsigned int platform_id,unsigned int dev_id,char *binFile)
{
    int i = 0;
    cl_int err = CL_SUCCESS;

    cl_int nPlatforms = 0;
    cl_platform_id *platforms = NULL;
    cl_platform_id platform = (cl_platform_id)NULL;
    cl_context_properties cprops[3];
    cl_context context;
    size_t nDevices = 0;
    cl_device_id devices[MAXGPUS];
    cl_device_id device_id = 0;
    size_t binary_size = 0;
    char * binary = NULL;
    cl_program program = NULL;
    char pbuf[100];
    cl_command_queue cmdq;
    cl_mem iBuf,oBuf;
    cl_kernel kernel;
    cl_int *inBuf,*outBuf;
    inBuf=(cl_int*)malloc(MAX_THREADS*sizeof(cl_int));
    outBuf=(cl_int*)malloc(MAX_THREADS*sizeof(cl_int));
    size_t N=MAX_THREADS;
    cl_event evnt;
    char buildOptions[200];
    char opencl_log[1024*64];

    /* figure out the number of platforms on this system. */
    err = clGetPlatformIDs( 0, NULL, &nPlatforms );
    checkErr( "clGetPlatformIDs", err );
    printf( "Number of platforms found: %d\n", nPlatforms );
    if( nPlatforms == 0 )
    {
        fprintf( stderr, "Cannot continue without any platforms. Exiting.\n" );
        return( -1 );
    }
    platforms = (cl_platform_id *)malloc( sizeof(cl_platform_id) * nPlatforms );
    err = clGetPlatformIDs( nPlatforms, platforms, NULL );
    checkErr( "clGetPlatformIDs", err );

    /* Check for AMD platform. */

    err = clGetPlatformInfo( platforms[platform_id], CL_PLATFORM_VENDOR,
                             sizeof(pbuf), pbuf, NULL );
    checkErr( "clGetPlatformInfo", err );
    if( strcmp(pbuf, "Advanced Micro Devices, Inc.") == 0 )
    {
        printf( "Found AMD platform\n" );
        platform = platforms[platform_id];

    }

    if( platform == (cl_context_properties)0 )
    {
        fprintf( stderr, "Could not find an AMD platform. Exiting.\n" );
        exit(0);
    }

    clGetDeviceIDs(platform,
                   CL_DEVICE_TYPE_ALL,MAXGPUS, devices, &nDevices);

    cprops[0] = CL_CONTEXT_PLATFORM;
    cprops[1] = (cl_context_properties)platform;
    cprops[2] = (cl_context_properties)NULL;

    context =   clCreateContext(cprops, 1, &devices[dev_id], NULL, NULL,
                                &err);
    checkErr( "clCreateContext", err );

    printDeviceName(dev_id,devices[dev_id]);

    /* read in the binary kernel. */
    binary = readKernelBin( &binary_size, binFile );

    /* create an OpenCL program from the binary kernel. */
    program = clCreateProgramWithBinary( context, 1, &devices[dev_id], &binary_size,
                                         (const unsigned char**)&binary, NULL, &err );
    checkErr( "clCreateProgramWithBinary", err );

    sprintf(buildOptions,"%s %s",OCL_BINARY_OPTIONS ,OCL_OPTIMIZATIONS);

    /* build the kernel source for all available devices in the context. */
    err = clBuildProgram( program, 0, NULL,buildOptions , NULL, NULL );

    checkErr("clGetProgramBuildInfo",clGetProgramBuildInfo(program, devices[dev_id],
             CL_PROGRAM_BUILD_LOG, sizeof(opencl_log), (void *) opencl_log,
             NULL));

    /*Report build errors and warnings*/
    if (err != CL_SUCCESS)
    {   fprintf(stderr, "Compilation log: %s\n", opencl_log);
        exit(0);
    }
#ifdef REPORT_OPENCL_WARNINGS
    else if (strlen(opencl_log) > 1)
        fprintf(stderr, "Compilation log: %s\n", opencl_log);
#endif


    /* IT IS APPLICATION-DEPENDENT WHAT TO DO AFTER THIS POINT. */
    printf( "\n*** REPLACE THIS WITH ACTUAL WORK ***\n" );

    for(i=0; i<MAX_THREADS; i++)
        inBuf[i]=i;

    kernel=clCreateKernel(program,"test",&err) ;

    if(err) {
        printf("Create Kernel test FAILED\n");
        return 0;
    }

    cmdq=clCreateCommandQueue(context, devices[dev_id], CL_QUEUE_PROFILING_ENABLE,&err);
    checkErr("Create CMDQ FAILED\n",err);

    iBuf=clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,MAX_THREADS*sizeof(cl_int),inBuf,&err);
    if((iBuf==(cl_mem)0)) {
        checkErr("Create Buffer FAILED\n",err);
    }

    oBuf=clCreateBuffer(context,CL_MEM_WRITE_ONLY,MAX_THREADS*sizeof(cl_int),NULL,&err);
    if((oBuf==(cl_mem)0)) {
        checkErr("Create Buffer FAILED\n",err);
    }


    checkErr("Set Kernel Arg FAILED arg0\n",clSetKernelArg(kernel,0,sizeof(cl_mem),&iBuf));

    checkErr("Set Kernel Arg FAILED arg1\n",clSetKernelArg(kernel,1,sizeof(cl_mem),&oBuf));

    err=clEnqueueNDRangeKernel(cmdq,kernel,1,NULL,&N,NULL,0,NULL,&evnt);

    clWaitForEvents(1,&evnt);

    checkErr("Write FAILED\n",clEnqueueReadBuffer(cmdq,oBuf,CL_TRUE,0,MAX_THREADS*sizeof(cl_uint),outBuf, 0, NULL, NULL));

    for(i=0; i<MAX_THREADS; i++)
        printf("%d\n",outBuf[i]);

    return (0);
}
int 
SimpleImage::setupCL()
{
    cl_int status = CL_SUCCESS;

#if 0

    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
    }

    size_t deviceListSize;

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */

    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clGetPlatformIDs failed."))
    {
        return SDK_FAILURE;
    }
    if (0 < numPlatforms) 
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clGetPlatformIDs failed."))
        {
            return SDK_FAILURE;
        }
        for (unsigned i = 0; i < numPlatforms; ++i) 
        {
            char pbuf[100];
            status = clGetPlatformInfo(platforms[i],
                                       CL_PLATFORM_VENDOR,
                                       sizeof(pbuf),
                                       pbuf,
                                       NULL);

            if(!sampleCommon->checkVal(status,
                                       CL_SUCCESS,
                                       "clGetPlatformInfo failed."))
            {
                return SDK_FAILURE;
            }

            platform = platforms[i];
            if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) 
            {
                break;
            }
        }
        delete[] platforms;
    }

    if(NULL == platform)
    {
        sampleCommon->error("NULL platform found so Exiting Application.");
        return SDK_FAILURE;
    }

    // Display available devices.
    if(!sampleCommon->displayDevices(platform, dType))
    {
        sampleCommon->error("sampleCommon::displayDevices() failed");
        return SDK_FAILURE;
    }

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */

    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(
        cps,
        dType,
        NULL,
        NULL,
        &status);

    if(!sampleCommon->checkVal(status,
        CL_SUCCESS,
        "clCreateContextFromType failed."))
    {
        return SDK_FAILURE;
    }

    /* First, get the size of device list data */
    status = clGetContextInfo(
        context, 
        CL_CONTEXT_DEVICES, 
        0, 
        NULL, 
        &deviceListSize);
    if(!sampleCommon->checkVal(
        status, 
        CL_SUCCESS,
        "clGetContextInfo failed."))
        return SDK_FAILURE;

    int deviceCount = (int)(deviceListSize / sizeof(cl_device_id));
    if(!sampleCommon->validateDeviceId(deviceId, deviceCount))
    {
        sampleCommon->error("sampleCommon::validateDeviceId() failed");
        return SDK_FAILURE;
    }

    /* Now allocate memory for device list based on the size we got earlier */
    devices = (cl_device_id*)malloc(deviceListSize);
    if(devices == NULL)
    {
        sampleCommon->error("Failed to allocate memory (devices).");
        return SDK_FAILURE;
    }

    /* Now, get the device list data */
    status = clGetContextInfo(
        context, 
        CL_CONTEXT_DEVICES, 
        deviceListSize, 
        devices, 
        NULL);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetContextInfo failed."))
        return SDK_FAILURE;

    /* Check for image support */
    status = clGetDeviceInfo(devices[deviceId],
                             CL_DEVICE_IMAGE_SUPPORT,
                             sizeof(cl_bool),
                             &imageSupport,
                             0);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetDeviceInfo failed."))
        return SDK_FAILURE;

    if(!imageSupport)
    {
        std::cout << "Error : Images are not supported on this device!\n";
        return SDK_EXPECTED_FAILURE;
    }
    /* Create command queue */

    cl_command_queue_properties prop = 0;

    if(timing)
        prop |= CL_QUEUE_PROFILING_ENABLE;

    commandQueue = clCreateCommandQueue(
        context,
        devices[deviceId],
        prop,
        &status);

    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateCommandQueue failed."))
    {
        return SDK_FAILURE;
    }

    /*
    * Create and initialize image objects
    */
    /* Create 2D input image */
    inputImage2D = clCreateImage2D(context,
                                   CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                   &imageFormat,
                                   width,
                                   height,
                                   0,
                                   inputImageData,
                                   &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateBuffer failed. (inputImageBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* Create 2D output image */
    outputImage2D = clCreateImage2D(context,
                                   CL_MEM_WRITE_ONLY,
                                   &imageFormat,
                                   width,
                                   height,
                                   0,
                                   0,
                                   &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateBuffer failed. (inputImageBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* Create 3D input image */
    inputImage3D = clCreateImage3D(context,
                                   CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                   &imageFormat,
                                   width,
                                   height / 2,  
                                   2,           //2 slices
                                   0,
                                   0,
                                   inputImageData,
                                   &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateBuffer failed. (inputImageBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* Writes to 3D images not allowed in spec currently */
    outputImage3D = clCreateImage2D(context,
                                   CL_MEM_WRITE_ONLY,
                                   &imageFormat,
                                   width,
                                   height,
                                   0,
                                   0,
                                   &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateBuffer failed. (inputImageBuffer)"))
    {
        return SDK_FAILURE;
    }

    if(!sampleCommon->checkVal(status,
        CL_SUCCESS,
        "clCreateBuffer failed. (outputImageBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* create a CL program using the kernel source */
    streamsdk::SDKFile kernelFile;
    std::string kernelPath = sampleCommon->getPath();

    if(isLoadBinaryEnabled())
    {
        kernelPath.append(loadBinary.c_str());
        if(!kernelFile.readBinaryFromFile(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }

        const char * binary = kernelFile.source().c_str();
        size_t binarySize = kernelFile.source().size();
        program = clCreateProgramWithBinary(context,
                                            1,
                                            &devices[deviceId], 
                                            (const size_t *)&binarySize,
                                            (const unsigned char**)&binary,
                                            NULL,
                                            &status);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clCreateProgramWithBinary failed."))
        {
            return SDK_FAILURE;
        }

    }
    else
    {
        kernelPath.append("SimpleImage_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : "<< kernelPath << std::endl;
            return SDK_FAILURE;
        }
        const char *source = kernelFile.source().c_str();
        size_t sourceSize[] = {strlen(source)};
        program = clCreateProgramWithSource(context,
            1,
            &source,
            sourceSize,
            &status);
        if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateProgramWithSource failed."))
            return SDK_FAILURE;
    }

    /* create a cl program executable for all the devices specified */
    status = clBuildProgram(
        program,
        1,
        &devices[deviceId],
        NULL,
        NULL,
        NULL);
    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            cl_int logStatus;
            char *buildLog = NULL;
            size_t buildLogSize = 0;
            logStatus = clGetProgramBuildInfo (program, 
                devices[deviceId], 
                CL_PROGRAM_BUILD_LOG, 
                buildLogSize, 
                buildLog, 
                &buildLogSize);
            if(!sampleCommon->checkVal(
                logStatus,
                CL_SUCCESS,
                "clGetProgramBuildInfo failed."))
                return SDK_FAILURE;

            buildLog = (char*)malloc(buildLogSize);
            if(buildLog == NULL)
            {
                sampleCommon->error("Failed to allocate host memory. (buildLog)");
                return SDK_FAILURE;
            }
            memset(buildLog, 0, buildLogSize);

            logStatus = clGetProgramBuildInfo (program, 
                devices[deviceId], 
                CL_PROGRAM_BUILD_LOG, 
                buildLogSize, 
                buildLog, 
                NULL);
            if(!sampleCommon->checkVal(
                logStatus,
                CL_SUCCESS,
                "clGetProgramBuildInfo failed."))
            {
                free(buildLog);
                return SDK_FAILURE;
            }

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << buildLog << std::endl;
            std::cout << " ************************************************\n";
            free(buildLog);
        }

        if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clBuildProgram failed."))
            return SDK_FAILURE;
    }

    /* get a kernel object handle for a kernel with the given name */
    kernel2D = clCreateKernel(program, "image2dCopy", &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateKernel failed."))
    {
        return SDK_FAILURE;
    }

    kernel3D = clCreateKernel(program, "image3dCopy", &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateKernel failed."))
    {
        return SDK_FAILURE;
    }

    /* Check group size against group size returned by kernel */
    status = clGetKernelWorkGroupInfo(kernel2D,
        devices[deviceId],
        CL_KERNEL_WORK_GROUP_SIZE,
        sizeof(size_t),
        &kernel2DWorkGroupSize,
        0);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetKernelWorkGroupInfo  failed."))
    {
        return SDK_FAILURE;
    }

    /* Check group size against group size returned by kernel */
    status = clGetKernelWorkGroupInfo(kernel3D,
        devices[deviceId],
        CL_KERNEL_WORK_GROUP_SIZE,
        sizeof(size_t),
        &kernel3DWorkGroupSize,
        0);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetKernelWorkGroupInfo  failed."))
    {
        return SDK_FAILURE;
    }

    cl_uint temp = (cl_uint)min(kernel2DWorkGroupSize, kernel3DWorkGroupSize);

    if((blockSizeX * blockSizeY) > temp)
    {
        if(!quiet)
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : "
                      << blockSizeX * blockSizeY << std::endl;
            std::cout << "Max Group Size supported on the kernel(s) : " 
                      << temp << std::endl;
            std::cout << "Falling back to " << temp << std::endl;
        }

        if(blockSizeX > temp)
        {
            blockSizeX = temp;
            blockSizeY = 1;
        }

    }

#endif

    return SDK_SUCCESS;
}
Esempio n. 4
0
_clState *initCl(unsigned int gpu, char *name, size_t nameSize)
{
	_clState *clState = calloc(1, sizeof(_clState));
	bool patchbfi = false, prog_built = false;
	cl_platform_id platform = NULL;
	char pbuff[256], vbuff[255];
	cl_platform_id* platforms;
	cl_device_id *devices;
	cl_uint numPlatforms;
	cl_uint numDevices;
	cl_int status;

	status = clGetPlatformIDs(0, NULL, &numPlatforms);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Getting Platforms. (clGetPlatformsIDs)");
		return NULL;
	}

	platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id));
	status = clGetPlatformIDs(numPlatforms, platforms, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Getting Platform Ids. (clGetPlatformsIDs)");
		return NULL;
	}

	if (opt_platform_id >= (int)numPlatforms) {
		applog(LOG_ERR, "Specified platform that does not exist");
		return NULL;
	}

	status = clGetPlatformInfo(platforms[opt_platform_id], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Getting Platform Info. (clGetPlatformInfo)");
		return NULL;
	}
	platform = platforms[opt_platform_id];

	if (platform == NULL) {
		perror("NULL platform found!\n");
		return NULL;
	}

	applog(LOG_INFO, "CL Platform vendor: %s", pbuff);
	status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL);
	if (status == CL_SUCCESS)
		applog(LOG_INFO, "CL Platform name: %s", pbuff);
	status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(vbuff), vbuff, NULL);
	if (status == CL_SUCCESS)
		applog(LOG_INFO, "CL Platform version: %s", vbuff);

	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Getting Device IDs (num)");
		return NULL;
	}

	if (numDevices > 0 ) {
		devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id));

		/* Now, get the device list data */

		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Getting Device IDs (list)");
			return NULL;
		}

		applog(LOG_INFO, "List of devices:");

		unsigned int i;
		for (i = 0; i < numDevices; i++) {
			status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS) {
				applog(LOG_ERR, "Error: Getting Device Info");
				return NULL;
			}

			applog(LOG_INFO, "\t%i\t%s", i, pbuff);
		}

		if (gpu < numDevices) {
			status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS) {
				applog(LOG_ERR, "Error: Getting Device Info");
				return NULL;
			}

			applog(LOG_INFO, "Selected %i: %s", gpu, pbuff);
			strncpy(name, pbuff, nameSize);
		} else {
			applog(LOG_ERR, "Invalid GPU %i", gpu);
			return NULL;
		}

	} else return NULL;

	cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };

	clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Creating Context. (clCreateContextFromType)");
		return NULL;
	}

	/* Check for BFI INT support. Hopefully people don't mix devices with
	 * and without it! */
	char * extensions = malloc(1024);
	const char * camo = "cl_amd_media_ops";
	char *find;

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS");
		return NULL;
	}
	find = strstr(extensions, camo);
	if (find)
		clState->hasBitAlign = true;
		
	/* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */
	char * devoclver = malloc(1024);
	const char * ocl10 = "OpenCL 1.0";

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_VERSION, 1024, (void *)devoclver, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_VERSION");
		return NULL;
	}
	find = strstr(devoclver, ocl10);
	if (!find)
		clState->hasOpenCL11plus = true;

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&clState->preferred_vwidth, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT");
		return NULL;
	}
	applog(LOG_DEBUG, "Preferred vector width reported %d", clState->preferred_vwidth);

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE");
		return NULL;
	}
	applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size);

	/* For some reason 2 vectors is still better even if the card says
	 * otherwise, and many cards lie about their max so use 256 as max
	 * unless explicitly set on the command line. 79x0 cards perform
	 * better without vectors */
	if (clState->preferred_vwidth > 1) {
		if (strstr(name, "Tahiti"))
			clState->preferred_vwidth = 1;
		else
			clState->preferred_vwidth = 2;
	}

	if (opt_vectors)
		clState->preferred_vwidth = opt_vectors;
	if (opt_worksize && opt_worksize <= (int)clState->max_work_size)
		clState->work_size = opt_worksize;
	else
		clState->work_size = (clState->max_work_size <= 256 ? clState->max_work_size : 256) /
				clState->preferred_vwidth;

	/* Create binary filename based on parameters passed to opencl
	 * compiler to ensure we only load a binary that matches what would
	 * have otherwise created. The filename is:
	 * name + kernelname +/i bitalign + v + vectors + w + work_size + sizeof(long) + .bin
	 */
	char binaryfilename[255];
	char filename[255];
	char numbuf[10];

	if (chosen_kernel == KL_NONE) {
		if (strstr(name, "Tahiti") // GCN
		    || !clState->hasBitAlign // Older Radeon & Nvidia
		    || strstr(vbuff, "844.4") // Linux 64 bit ATI 2.6 SDK
		    || strstr(vbuff, "851.4") // Windows 64 bit ""
		    || strstr(vbuff, "831.4") // Windows & Linux 32 bit ""
		)
			clState->chosen_kernel = KL_POCLBM;
		else
			clState->chosen_kernel = KL_PHATK;
	} else
		clState->chosen_kernel = chosen_kernel;

	switch (clState->chosen_kernel) {
		case KL_POCLBM:
			strcpy(filename, POCLBM_KERNNAME".cl");
			strcpy(binaryfilename, POCLBM_KERNNAME);
			break;
		case KL_NONE: /* Shouldn't happen */
		case KL_PHATK:
			strcpy(filename, PHATK_KERNNAME".cl");
			strcpy(binaryfilename, PHATK_KERNNAME);
			break;
		case KL_DIAKGCN:
			strcpy(filename, DIAKGCN_KERNNAME".cl");
			strcpy(binaryfilename, DIAKGCN_KERNNAME);
			break;
		case KL_DIABLO:
			strcpy(filename, DIABLO_KERNNAME".cl");
			strcpy(binaryfilename, DIABLO_KERNNAME);
			break;
	}

	FILE *binaryfile;
	size_t *binary_sizes;
	char **binaries;
	int pl;
	char *source = file_contents(filename, &pl);
	size_t sourceSize[] = {(size_t)pl};
	cl_uint slot, cpnd;

	slot = cpnd = 0;

	if (!source)
		return NULL;

	binary_sizes = calloc(sizeof(size_t) * MAX_GPUDEVICES * 4, 1);
	if (unlikely(!binary_sizes)) {
		applog(LOG_ERR, "Unable to calloc binary_sizes");
		return NULL;
	}
	binaries = calloc(sizeof(char *) * MAX_GPUDEVICES * 4, 1);
	if (unlikely(!binaries)) {
		applog(LOG_ERR, "Unable to calloc binaries");
		return NULL;
	}

	strcat(binaryfilename, name);
	if (clState->hasBitAlign)
		strcat(binaryfilename, "bitalign");

	strcat(binaryfilename, "v");
	sprintf(numbuf, "%d", clState->preferred_vwidth);
	strcat(binaryfilename, numbuf);
	strcat(binaryfilename, "w");
	sprintf(numbuf, "%d", (int)clState->work_size);
	strcat(binaryfilename, numbuf);
	strcat(binaryfilename, "long");
	sprintf(numbuf, "%d", (int)sizeof(long));
	strcat(binaryfilename, numbuf);
	strcat(binaryfilename, ".bin");

loadbin:
	binaryfile = fopen(binaryfilename, "rb");
	if (!binaryfile) {
		applog(LOG_DEBUG, "No binary found, generating from source");
	} else {
		struct stat binary_stat;

		if (unlikely(stat(binaryfilename, &binary_stat))) {
			applog(LOG_DEBUG, "Unable to stat binary, generating from source");
			fclose(binaryfile);
			goto build;
		}
		if (!binary_stat.st_size)
			goto build;

		binary_sizes[slot] = binary_stat.st_size;
		binaries[slot] = (char *)calloc(binary_sizes[slot], 1);
		if (unlikely(!binaries[slot])) {
			applog(LOG_ERR, "Unable to calloc binaries");
			fclose(binaryfile);
			return NULL;
		}

		if (fread(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot]) {
			applog(LOG_ERR, "Unable to fread binaries");
			fclose(binaryfile);
			free(binaries[slot]);
			goto build;
		}

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)");
			fclose(binaryfile);
			free(binaries[slot]);
			goto build;
		}

		clRetainProgram(clState->program);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)");
			return NULL;
		}

		fclose(binaryfile);
		applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename);

		goto built;
	}

	/////////////////////////////////////////////////////////////////
	// Load CL file, build CL program object, create CL kernel object
	/////////////////////////////////////////////////////////////////

build:
	/* If no binary is available, and we have a card that suffers with phatk
	 * on SDK2.6, use the poclbm kernel instead if one has not been
	 * selected. */
	if (clState->chosen_kernel != KL_POCLBM && chosen_kernel == KL_NONE &&
		!strstr(name, "Tahiti") && clState->hasBitAlign &&
		(strstr(vbuff, "844.4") /* Linux 64 bit ATI 2.6 SDK */	||
		 strstr(vbuff, "851.4") /* Windows 64 bit "" */		||
		 strstr(vbuff, "831.4") /* Windows & Linux 32 bit "" */ )) {
			applog(LOG_WARNING, "SDK 2.6 detected, using poclbm kernel");
			clState->chosen_kernel = KL_POCLBM;
			strcpy(filename, POCLBM_KERNNAME".cl");
			strcpy(binaryfilename, POCLBM_KERNNAME);
			strcat(binaryfilename, name);
			strcat(binaryfilename, "bitalign");
			strcat(binaryfilename, "v");
			sprintf(numbuf, "%d", clState->preferred_vwidth);
			strcat(binaryfilename, numbuf);
			strcat(binaryfilename, "w");
			sprintf(numbuf, "%d", (int)clState->work_size);
			strcat(binaryfilename, numbuf);
			strcat(binaryfilename, "long");
			sprintf(numbuf, "%d", (int)sizeof(long));
			strcat(binaryfilename, numbuf);
			strcat(binaryfilename, ".bin");

			goto loadbin;
	}

	clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithSource)");
		return NULL;
	}

	clRetainProgram(clState->program);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)");
		return NULL;
	}

	/* create a cl program executable for all the devices specified */
	char *CompilerOptions = calloc(1, 256);

	sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d",
		(int)clState->work_size, clState->preferred_vwidth);
	applog(LOG_DEBUG, "Setting worksize to %d", clState->work_size);
	if (clState->preferred_vwidth > 1)
		applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->preferred_vwidth);

	if (clState->hasBitAlign) {
		strcat(CompilerOptions, " -D BITALIGN");
		applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN");
		if (strstr(name, "Cedar") ||
		    strstr(name, "Redwood") ||
		    strstr(name, "Juniper") ||
		    strstr(name, "Cypress" ) ||
		    strstr(name, "Hemlock" ) ||
		    strstr(name, "Caicos" ) ||
		    strstr(name, "Turks" ) ||
		    strstr(name, "Barts" ) ||
		    strstr(name, "Cayman" ) ||
		    strstr(name, "Antilles" ) ||
		    strstr(name, "Wrestler" ) ||
		    strstr(name, "Zacate" ) ||
		    strstr(name, "WinterPark" ) ||
		    strstr(name, "BeaverCreek" ))
			patchbfi = true;
	} else
		applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN");

	if (patchbfi) {
		strcat(CompilerOptions, " -D BFI_INT");
		applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT");
	} else
		applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch");

	applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions);
	status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL);
	free(CompilerOptions);

	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Building Program (clBuildProgram)");
		size_t logSize;
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

		char *log = malloc(logSize);
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
		applog(LOG_INFO, "%s", log);
		return NULL;
	}

	prog_built = true;

	status = clGetProgramInfo(clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)");
		return NULL;
	}

	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)");
		return NULL;
	}

	/* The actual compiled binary ends up in a RANDOM slot! Grr, so we have
	 * to iterate over all the binary slots and find where the real program
	 * is. What the heck is this!? */
	for (slot = 0; slot < cpnd; slot++)
		if (binary_sizes[slot])
			break;

	/* copy over all of the generated binaries. */
	applog(LOG_DEBUG, "Binary size for gpu %d found in binary slot %d: %d", gpu, slot, binary_sizes[slot]);
	if (!binary_sizes[slot]) {
		applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!");
		return NULL;
	}
	binaries[slot] = calloc(sizeof(char) * binary_sizes[slot], 1);
	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL );
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)");
		return NULL;
	}

	/* Patch the kernel if the hardware supports BFI_INT but it needs to
	 * be hacked in */
	if (patchbfi) {
		unsigned remaining = binary_sizes[slot];
		char *w = binaries[slot];
		unsigned int start, length;

		/* Find 2nd incidence of .text, and copy the program's
		* position and length at a fixed offset from that. Then go
		* back and find the 2nd incidence of \x7ELF (rewind by one
		* from ELF) and then patch the opcocdes */
		if (!advance(&w, &remaining, ".text"))
			goto build;
		w++; remaining--;
		if (!advance(&w, &remaining, ".text")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		memcpy(&start, w + 285, 4);
		memcpy(&length, w + 289, 4);
		w = binaries[slot]; remaining = binary_sizes[slot];
		if (!advance(&w, &remaining, "ELF"))
			goto build;
		w++; remaining--;
		if (!advance(&w, &remaining, "ELF")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		w--; remaining++;
		w += start; remaining -= start;
		applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching",
			w, remaining);
		patch_opcodes(w, length);

		status = clReleaseProgram(clState->program);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Releasing program. (clReleaseProgram)");
			return NULL;
		}

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)");
			return NULL;
		}

		clRetainProgram(clState->program);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)");
			return NULL;
		}

		/* Program needs to be rebuilt */
		prog_built = false;
	}

	free(source);

	/* Save the binary to be loaded next time */
	binaryfile = fopen(binaryfilename, "wb");
	if (!binaryfile) {
		/* Not a fatal problem, just means we build it again next time */
		applog(LOG_DEBUG, "Unable to create file %s", binaryfilename);
	} else {
		if (unlikely(fwrite(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot])) {
			applog(LOG_ERR, "Unable to fwrite to binaryfile");
			return NULL;
		}
		fclose(binaryfile);
	}
built:
	if (binaries[slot])
		free(binaries[slot]);
	free(binaries);
	free(binary_sizes);

	applog(LOG_INFO, "Initialising kernel %s with%s bitalign, %d vectors and worksize %d",
	       filename, clState->hasBitAlign ? "" : "out", clState->preferred_vwidth, clState->work_size);

	if (!prog_built) {
		/* create a cl program executable for all the devices specified */
		status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Building Program (clBuildProgram)");
			size_t logSize;
			status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

			char *log = malloc(logSize);
			status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
			applog(LOG_INFO, "%s", log);
			return NULL;
		}

		clRetainProgram(clState->program);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)");
			return NULL;
		}
	}

	/* get a kernel object handle for a kernel with the given name */
	clState->kernel = clCreateKernel(clState->program, "search", &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Creating Kernel from program. (clCreateKernel)");
		return NULL;
	}

	/////////////////////////////////////////////////////////////////
	// Create an OpenCL command queue
	/////////////////////////////////////////////////////////////////
	clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu],
						     CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status);
	if (status != CL_SUCCESS) /* Try again without OOE enable */
		clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Creating Command Queue. (clCreateCommandQueue)");
		return NULL;
	}

	clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: clCreateBuffer (outputBuffer)");
		return NULL;
	}

	return clState;
}
Esempio n. 5
0
void run_delaunay(cl_device_id device, bool is_cpu)
{
  curr_device = device;

  /* 
    Open Input Points 
  */
  scll points = open_points("inputpoints.bin");
  int num_points = points->count;  
   
  /* 
    Create Context
  */
  int error_code;
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &error_code);
  if(error_code != 0)
    {
      printf("clCreateContext error code = %d\n", error_code);
      goto ExitFunction;
    }

  /*
    Create Command Queue
  */
  cl_command_queue_properties properties = 0;
  command_queue = clCreateCommandQueue(context, device, properties, &error_code);
  if(error_code != 0)
    {
      printf("clCreateCommandQueue error code ret=%d\n", error_code);
      goto ReleaseContext;
    }

  char * program_name;
  if(is_cpu)
    program_name = "cpu_kernel";
  else
    program_name = "cell_kernel";

  /*
    Open Program
  */
  size_t binary_length;
  unsigned char * binary;  
  OpenProgramBinary(program_name, &binary_length, &binary);
  cl_program program;
  cl_int binary_status;
  program = clCreateProgramWithBinary(context, 1, &device, &binary_length, (const unsigned char **) &binary, &binary_status, &error_code);
  if(error_code != 0)
    {
      printf("clCreateProgramWithBinary error code = %d\n", error_code);
      goto ReleaseCommandQueue;
    }

  char * kernel_name;
  if(is_cpu)
    kernel_name = "InCircle";
  else
    kernel_name = "cell_function";

  /*
    Open Kernel
  */
  kernel = clCreateKernel(program, kernel_name, &error_code);
  if(error_code != 0)
    {
      printf("clCreateKernel error code = %d\n", error_code);
      goto ReleaseProgram;
    }

  /* 
    Create buffers
  */
  points_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, points->alloc_size, NULL, &error_code);
  if(error_code != 0)
    {
      printf("clCreateBuffer (points_mem) error code = %d\n", error_code);
      goto ReleaseKernel;
    }

  scll triangles = delaunay_core(points, num_points, is_cpu);
  //save_triangles("triangles.txt", triangles);

  clReleaseMemObject(points_mem);
ReleaseKernel:
  clReleaseKernel(kernel);
ReleaseProgram:
  clReleaseProgram(program);
ReleaseCommandQueue:
  clReleaseCommandQueue(command_queue);
ReleaseContext:
  clReleaseContext(context);
ExitFunction:
  return;
}
Esempio n. 6
0
cl_program
piglit_cl_build_program_with_binary_extended(piglit_cl_context context,
                                             size_t* lenghts,
                                             unsigned char** binaries,
                                             const char* options, bool fail)
{
	cl_int errNo;
	cl_program program;

	cl_int* binary_status = malloc(sizeof(cl_int) * context->num_devices);

	program = clCreateProgramWithBinary(context->cl_ctx,
	                                    context->num_devices,
	                                    context->device_ids,
	                                    lenghts,
	                                    (const unsigned char**)binaries,
	                                    binary_status,
	                                    &errNo);
	if(errNo != CL_SUCCESS) {
		int i;

		fprintf(stderr,
		        "Could not create program with binary: %s\n",
		        piglit_cl_get_error_name(errNo));

		printf("Create error with binaries:\n");
		for(i = 0; i < context->num_devices; i++) {
			char* device_name = piglit_cl_get_device_info(context->device_ids[i],
			                                              CL_DEVICE_NAME);
			
			printf("Error for %s: %s\n",
			       device_name,
			       piglit_cl_get_error_name(binary_status[i]));
			
			free(device_name);
		}

		free(binary_status);
		return NULL;
	}
	free(binary_status);
	
	errNo = clBuildProgram(program,
	                       context->num_devices,
	                       context->device_ids,
	                       options,
	                       NULL,
	                       NULL);
	if(   (!fail && errNo != CL_SUCCESS)
	   || ( fail && errNo == CL_SUCCESS)) {
		int i;

		fprintf(stderr,
		        !fail ? "Could not build program: %s\n"
		              : "Program built when it should have failed: %s\n",
		        piglit_cl_get_error_name(errNo));

		printf("Build log for binaries.\n");

		for(i = 0; i < context->num_devices; i++) {
			char* device_name = piglit_cl_get_device_info(context->device_ids[i],
			                                              CL_DEVICE_NAME);
			char* log = piglit_cl_get_program_build_info(program,
			                                             context->device_ids[i],
			                                             CL_PROGRAM_BUILD_LOG);
			
			printf("Build log for device %s:\n -------- \n%s\n -------- \n",
			       device_name,
			       log);
			
			free(device_name);
			free(log);
		}

		clReleaseProgram(program);
		return NULL;
	}

	return program;
}
int starpu_opencl_load_binary_opencl(const char *kernel_id, struct starpu_opencl_program *opencl_programs)
{
	unsigned int dev;
	unsigned int nb_devices;

	nb_devices = _starpu_opencl_get_device_count();
	// Iterate over each device
	for(dev = 0; dev < nb_devices; dev ++)
	{
		cl_device_id device;
		cl_context   context;
		cl_program   program;
		cl_int       err;
		char        *binary;
		char         binary_file_name[1024];
		size_t       length;
		cl_int       binary_status;

		opencl_programs->programs[dev] = NULL;

		starpu_opencl_get_device(dev, &device);
		starpu_opencl_get_context(dev, &context);
		if (context == NULL)
		{
			_STARPU_DEBUG("[%u] is not a valid OpenCL context\n", dev);
			continue;
		}

		// Load the binary buffer
		err = _starpu_opencl_get_binary_name(binary_file_name, 1024, kernel_id, dev, device);
		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
		binary = _starpu_opencl_load_program_binary(binary_file_name, &length);

		// Create the compute program from the binary buffer
		program = clCreateProgramWithBinary(context, 1, &device, &length, (const unsigned char **) &binary, &binary_status, &err);
		if (!program || err != CL_SUCCESS)
		{
			_STARPU_DISP("Error: Failed to load program binary!\n");
			return EXIT_FAILURE;
		}

		// Build the program executable
		err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);

		// Get the status
		{
			cl_build_status status;
			size_t len;
			static char buffer[4096] = "";

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
			if (len > 2)
				_STARPU_DISP("Compilation output\n%s\n", buffer);

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL);
			if (err != CL_SUCCESS || status != CL_BUILD_SUCCESS)
			{
				_STARPU_DISP("Error: Failed to build program executable!\n");
				_STARPU_DISP("clBuildProgram: %d - clGetProgramBuildInfo: %d\n", err, status);
				return EXIT_FAILURE;
			}
		}

		// Store program
		opencl_programs->programs[dev] = program;
	}
	return 0;
}
Esempio n. 8
0
int
NBody::setupCL()
{
    cl_int status = CL_SUCCESS;

    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */

    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clGetPlatformIDs failed."))
    {
        return SDK_FAILURE;
    }
    if (0 < numPlatforms) 
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clGetPlatformIDs failed."))
        {
            return SDK_FAILURE;
        }
        for (unsigned i = 0; i < numPlatforms; ++i) 
        {
            char pbuf[100];
            status = clGetPlatformInfo(platforms[i],
                                       CL_PLATFORM_VENDOR,
                                       sizeof(pbuf),
                                       pbuf,
                                       NULL);

            if(!sampleCommon->checkVal(status,
                                       CL_SUCCESS,
                                       "clGetPlatformInfo failed."))
            {
                return SDK_FAILURE;
            }

            platform = platforms[i];
            if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) 
            {
                break;
            }
        }
        delete[] platforms;
    }

    if(NULL == platform)
    {
        sampleCommon->error("NULL platform found so Exiting Application.");
        return SDK_FAILURE;
    }

    // Display available devices.
    if(!sampleCommon->displayDevices(platform, dType))
    {
        sampleCommon->error("sampleCommon::displayDevices() failed");
        return SDK_FAILURE;
    }

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */

    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(
        cps,
        dType,
        NULL,
        NULL,
        &status);

    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateContextFromType failed."))
    {
        return SDK_FAILURE;
    }

    size_t deviceListSize;

    /* First, get the size of device list data */
    status = clGetContextInfo(
        context, 
        CL_CONTEXT_DEVICES, 
        0, 
        NULL, 
        &deviceListSize);
    if(!sampleCommon->checkVal(
        status, 
        CL_SUCCESS,
        "clGetContextInfo failed."))
        return SDK_FAILURE;

    int deviceCount = (int)(deviceListSize / sizeof(cl_device_id));
    if(!sampleCommon->validateDeviceId(deviceId, deviceCount))
    {
        sampleCommon->error("sampleCommon::validateDeviceId() failed");
        return SDK_FAILURE;
    }

    /* Now allocate memory for device list based on the size we got earlier */
    devices = (cl_device_id*)malloc(deviceListSize);
    if(devices == NULL)
    {
        sampleCommon->error("Failed to allocate memory (devices).");
        return SDK_FAILURE;
    }

    /* Now, get the device list data */
    status = clGetContextInfo(
        context, 
        CL_CONTEXT_DEVICES, 
        deviceListSize, 
        devices, 
        NULL);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetContextInfo failed."))
        return SDK_FAILURE;


    /* Create command queue */

    commandQueue = clCreateCommandQueue(
        context,
        devices[deviceId],
        0,
        &status);

    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateCommandQueue failed."))
    {
        return SDK_FAILURE;
    }

    /* Get Device specific Information */
    status = clGetDeviceInfo(
        devices[deviceId],
        CL_DEVICE_MAX_WORK_GROUP_SIZE,
        sizeof(size_t),
        (void*)&maxWorkGroupSize,
        NULL);

    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed."))
        return SDK_FAILURE;


    status = clGetDeviceInfo(
        devices[deviceId],
        CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
        sizeof(cl_uint),
        (void*)&maxDimensions,
        NULL);

    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed."))
        return SDK_FAILURE;


    maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t));

    status = clGetDeviceInfo(
        devices[deviceId],
        CL_DEVICE_MAX_WORK_ITEM_SIZES,
        sizeof(size_t) * maxDimensions,
        (void*)maxWorkItemSizes,
        NULL);

    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed."))
        return SDK_FAILURE;


    status = clGetDeviceInfo(
        devices[deviceId],
        CL_DEVICE_LOCAL_MEM_SIZE,
        sizeof(cl_ulong),
        (void *)&totalLocalMemory,
        NULL);

    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetDeviceInfo CL_DEVICE_LOCAL_MEM_SIZE failed."))
        return SDK_FAILURE;


    /*
    * Create and initialize memory objects
    */

    /* Create memory objects for position */
    currPos = clCreateBuffer(
        context,
        CL_MEM_READ_WRITE,
        numBodies * sizeof(cl_float4),
        0,
        &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateBuffer failed. (oldPos)"))
    {
        return SDK_FAILURE;
    }

    /* Initialize position buffer */
    status = clEnqueueWriteBuffer(commandQueue,
                                  currPos,
                                  1,
                                  0,
                                  numBodies * sizeof(cl_float4),
                                  pos,
                                  0,
                                  0,
                                  0);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clEnqueueWriteBuffer failed. (oldPos)"))
    {
        return SDK_FAILURE;
    }


    /* Create memory objects for position */
    newPos = clCreateBuffer(
        context,
        CL_MEM_READ_WRITE,
        numBodies * sizeof(cl_float4),
        0,
        &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateBuffer failed. (newPos)"))
    {
        return SDK_FAILURE;
    }

    /* Create memory objects for velocity */
    currVel = clCreateBuffer(
        context,
        CL_MEM_READ_WRITE,
        numBodies * sizeof(cl_float4),
        0,
        &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateBuffer failed. (oldVel)"))
    {
        return SDK_FAILURE;
    }

    /* Initialize velocity buffer */
    status = clEnqueueWriteBuffer(commandQueue,
                                  currVel,
                                  1,
                                  0,
                                  numBodies * sizeof(cl_float4),
                                  vel,
                                  0,
                                  0,
                                  0);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clEnqueueWriteBuffer failed. (oldVel)"))
    {
        return SDK_FAILURE;
    }

    /* Create memory objects for velocity */
    newVel = clCreateBuffer(
        context,
        CL_MEM_READ_ONLY,
        numBodies * sizeof(cl_float4),
        0,
        &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateBuffer failed. (newVel)"))
    {
        return SDK_FAILURE;
    }

    /* create a CL program using the kernel source */
    streamsdk::SDKFile kernelFile;
    std::string kernelPath = sampleCommon->getPath();

    if(isLoadBinaryEnabled())
    {
        kernelPath.append(loadBinary.c_str());
        if(!kernelFile.readBinaryFromFile(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }

        const char * binary = kernelFile.source().c_str();
        size_t binarySize = kernelFile.source().size();
        program = clCreateProgramWithBinary(context,
                                            1,
                                            &devices[deviceId], 
                                            (const size_t *)&binarySize,
                                            (const unsigned char**)&binary,
                                            NULL,
                                            &status);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clCreateProgramWithBinary failed."))
        {
            return SDK_FAILURE;
        }

    }
    else
    {
        kernelPath.append("NBody_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        const char * source = kernelFile.source().c_str();
        size_t sourceSize[] = { strlen(source) };
        program = clCreateProgramWithSource(context,
                                            1,
                                            &source,
                                            sourceSize,
                                            &status);
        if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateProgramWithSource failed."))
            return SDK_FAILURE;
        }

    /* create a cl program executable for all the devices specified */
    status = clBuildProgram(
        program,
        1,
        &devices[deviceId],
        NULL,
        NULL,
        NULL);
    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            cl_int logStatus;
            char * buildLog = NULL;
            size_t buildLogSize = 0;
            logStatus = clGetProgramBuildInfo (program, 
                devices[deviceId], 
                CL_PROGRAM_BUILD_LOG, 
                buildLogSize, 
                buildLog, 
                &buildLogSize);
            if(!sampleCommon->checkVal(
                logStatus,
                CL_SUCCESS,
                "clGetProgramBuildInfo failed."))
                return SDK_FAILURE;

            buildLog = (char*)malloc(buildLogSize);
            if(buildLog == NULL)
            {
                sampleCommon->error("Failed to allocate host memory. (buildLog)");
                return SDK_FAILURE;
            }
            memset(buildLog, 0, buildLogSize);

            logStatus = clGetProgramBuildInfo (program, 
                devices[deviceId], 
                CL_PROGRAM_BUILD_LOG, 
                buildLogSize, 
                buildLog, 
                NULL);
            if(!sampleCommon->checkVal(
                logStatus,
                CL_SUCCESS,
                "clGetProgramBuildInfo failed."))
            {
                free(buildLog);
                return SDK_FAILURE;
            }

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << buildLog << std::endl;
            std::cout << " ************************************************\n";
            free(buildLog);
        }

        if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clBuildProgram failed."))
            return SDK_FAILURE;
    }

    /* get a kernel object handle for a kernel with the given name */
    kernel = clCreateKernel(
        program,
        "nbody_sim",
        &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateKernel failed."))
    {
        return SDK_FAILURE;
    }

    return SDK_SUCCESS;
}
int
BoxFilterGLSeparable::setupCL()
{
    cl_int status = CL_SUCCESS;
    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
    }

    size_t deviceListSize;

    /*
    * Have a look at the available platforms and pick either
    * the AMD one if available or the system default.
    */

    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (CL_SUCCESS != status) {
        fputs("clGetPlatformIDs() failed", stderr);
        exit(-1);
    }
    if (0 < numPlatforms) {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if (CL_SUCCESS != status) {
            fputs("clGetPlatformIDs() failed", stderr);
            exit(-1);
        }
        for (unsigned i = 0; i < numPlatforms; ++i) {
            char pbuf[100];
            status = clGetPlatformInfo(
                         platforms[i],
                         CL_PLATFORM_VENDOR,
                         sizeof(pbuf),
                         pbuf,
                         NULL);
            if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
                platform = platforms[i];
                break;
            }
        }
        delete platforms;
    }

    if(NULL == platform)
    {
        sampleCommon->error("NULL platform found so Exiting Application.");
        return SDK_FAILURE;
    }

    // Display available devices.
    if(!sampleCommon->displayDevices(platform, dType))
    {
        sampleCommon->error("sampleCommon::displayDevices() failed");
        return SDK_FAILURE;
    }

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
#ifdef _WIN32
    HGLRC glCtx = wglGetCurrentContext();
#else //!_WIN32
    GLXContext glCtx = glXGetCurrentContext();
#endif //!_WIN32

    cl_context_properties cpsGL[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
#ifdef _WIN32
        CL_WGL_HDC_KHR,
        (intptr_t)wglGetCurrentDC(),
#else //!_WIN32
        CL_GLX_DISPLAY_KHR,
        (intptr_t)glXGetCurrentDisplay(),
#endif //!_WIN32
        CL_GL_CONTEXT_KHR,
        (intptr_t)glCtx,
        0
    };

    context = clCreateContextFromType(cpsGL,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateContextFromType failed."))
    {
        return SDK_FAILURE;
    }

    /* First, get the size of device list data */
    status = clGetContextInfo(
                 context,
                 CL_CONTEXT_DEVICES,
                 0,
                 NULL,
                 &deviceListSize);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clGetContextInfo failed."))
        return SDK_FAILURE;

    int deviceCount = (int)(deviceListSize / sizeof(cl_device_id));
    if(!sampleCommon->validateDeviceId(deviceId, deviceCount))
    {
        sampleCommon->error("sampleCommon::validateDeviceId() failed");
        return SDK_FAILURE;
    }

    /* Now allocate memory for device list based on the size we got earlier */
    devices = (cl_device_id*)malloc(deviceListSize);
    if(devices == NULL)
    {
        sampleCommon->error("Failed to allocate memory (devices).");
        return SDK_FAILURE;
    }

    /* Now, get the device list data */
    status = clGetContextInfo(
                 context,
                 CL_CONTEXT_DEVICES,
                 deviceListSize,
                 devices,
                 NULL);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clGetContextInfo failed."))
        return SDK_FAILURE;

    /* Create command queue */

    cl_command_queue_properties prop = 0;

    if(timing)
        prop |= CL_QUEUE_PROFILING_ENABLE;

    commandQueue = clCreateCommandQueue(
                       context,
                       devices[deviceId],
                       prop,
                       &status);

    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clCreateCommandQueue failed."))
    {
        return SDK_FAILURE;
    }

    /*
     * Create texture object
     */
    glGenTextures(1, &tex);
    glBindTexture(GL_TEXTURE_2D, tex);

    /* Set parameters */
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glTexImage2D(GL_TEXTURE_2D, 0,  GL_RGBA, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0);
    glBindTexture(GL_TEXTURE_2D, 0);

    /*
     * Create pixel-buffer object
     */
    glGenBuffers(1, &pbo);
    glBindBuffer(GL_ARRAY_BUFFER, pbo);

    // initialize buffer object
    unsigned int size = width * height * sizeof(cl_uchar4);

    // buffer data
    glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);


    /* Create OpenCL buffer from GL PBO */
    outputImageBuffer = clCreateFromGLBuffer(context,
                        CL_MEM_WRITE_ONLY,
                        pbo,
                        &status);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clCreateFromGLBuffer failed. (outputImageBuffer)"))
        return SDK_FAILURE;

    /*
    * Create and initialize memory objects
    */

    /* Create memory object for input Image */
    inputImageBuffer = clCreateBuffer(
                           context,
                           CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                           width * height * pixelSize,
                           inputImageData,
                           &status);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clCreateBuffer failed. (inputImageBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* Create memory object for temp Image */
    tempImageBuffer = clCreateBuffer(
                          context,
                          CL_MEM_READ_WRITE,
                          width * height * pixelSize,
                          0,
                          &status);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clCreateBuffer failed. (tempImageBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* create a CL program using the kernel source */
    streamsdk::SDKFile kernelFile;
    std::string kernelPath = sampleCommon->getPath();

    if(isLoadBinaryEnabled())
    {
        kernelPath.append(loadBinary.c_str());
        if(!kernelFile.readBinaryFromFile(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }

        const char * binary = kernelFile.source().c_str();
        size_t binarySize = kernelFile.source().size();
        program = clCreateProgramWithBinary(context,
                                            1,
                                            &devices[deviceId],
                                            (const size_t *)&binarySize,
                                            (const unsigned char**)&binary,
                                            NULL,
                                            &status);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clCreateProgramWithBinary failed."))
        {
            return SDK_FAILURE;
        }

    }
    else
    {
        kernelPath.append("BoxFilterGL_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        const char *source = kernelFile.source().c_str();
        size_t sourceSize[] = {strlen(source)};
        program = clCreateProgramWithSource(context,
                                            1,
                                            &source,
                                            sourceSize,
                                            &status);
        if(!sampleCommon->checkVal(
                    status,
                    CL_SUCCESS,
                    "clCreateProgramWithSource failed."))
            return SDK_FAILURE;
    }

    /* create a cl program executable for all the devices specified */
    status = clBuildProgram(
                 program,
                 1,
                 &devices[deviceId],
                 NULL,
                 NULL,
                 NULL);
    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            cl_int logStatus;
            char *buildLog = NULL;
            size_t buildLogSize = 0;
            logStatus = clGetProgramBuildInfo (program,
                                               devices[deviceId],
                                               CL_PROGRAM_BUILD_LOG,
                                               buildLogSize,
                                               buildLog,
                                               &buildLogSize);
            if(!sampleCommon->checkVal(
                        logStatus,
                        CL_SUCCESS,
                        "clGetProgramBuildInfo failed."))
                return SDK_FAILURE;

            buildLog = (char*)malloc(buildLogSize);
            if(buildLog == NULL)
            {
                sampleCommon->error("Failed to allocate host memory.(buildLog)");
                return SDK_FAILURE;
            }
            memset(buildLog, 0, buildLogSize);

            logStatus = clGetProgramBuildInfo (program,
                                               devices[deviceId],
                                               CL_PROGRAM_BUILD_LOG,
                                               buildLogSize,
                                               buildLog,
                                               NULL);
            if(!sampleCommon->checkVal(
                        logStatus,
                        CL_SUCCESS,
                        "clGetProgramBuildInfo failed."))
            {
                free(buildLog);
                return SDK_FAILURE;
            }

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << buildLog << std::endl;
            std::cout << " ************************************************\n";
            free(buildLog);
        }

        if(!sampleCommon->checkVal(
                    status,
                    CL_SUCCESS,
                    "clBuildProgram failed."))
            return SDK_FAILURE;
    }
    /* get a kernel object handle for a kernel with the given name */
    verticalKernel = clCreateKernel(program,
                                    "box_filter_vertical",
                                    &status);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateKernel failed. (vertical)"))
    {
        return SDK_FAILURE;
    }
#ifdef USE_LDS
    horizontalKernel = clCreateKernel(program,
                                      "box_filter_horizontal_local",
                                      &status);
#else
    horizontalKernel = clCreateKernel(program,
                                      "box_filter_horizontal",
                                      &status);
#endif
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateKernel failed. (horizontal)"))
    {
        return SDK_FAILURE;
    }

    /* Check group size against group size returned by kernel */
    status = clGetKernelWorkGroupInfo(verticalKernel,
                                      devices[deviceId],
                                      CL_KERNEL_WORK_GROUP_SIZE,
                                      sizeof(size_t),
                                      &kernelWorkGroupSize,
                                      0);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clGetKernelWorkGroupInfo  failed."))
    {
        return SDK_FAILURE;
    }

    if((blockSizeX * blockSizeY) > kernelWorkGroupSize)
    {
        if(!quiet)
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : "
                      << blockSizeX * blockSizeY << std::endl;
            std::cout << "Max Group Size supported on the kernel : "
                      << kernelWorkGroupSize << std::endl;
            std::cout << "Falling back to " << kernelWorkGroupSize << std::endl;
        }

        /* Three possible cases */
        if(blockSizeX > kernelWorkGroupSize)
        {
            blockSizeX = kernelWorkGroupSize;
            blockSizeY = 1;
        }
    }

    return SDK_SUCCESS;
}
Esempio n. 10
0
struct cl_package initFPGA( const char* xclbin, const char* kernel_name )
{
	/*****************************************/
	/* Initialize OpenCL */
	/*****************************************/

	// Retrieve the number of platforms
    cl_uint numPlatforms = 0;
    cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);

	//printf("Found %d platforms support OpenCL, return code %d.\n", numPlatforms, status);
 
    // Allocate enough space for each platform
    cl_platform_id *platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id));
 
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
	if (status != CL_SUCCESS)
		printf("clGetPlatformIDs error(%d)\n", status);
	
	// Retrieve the number of devices
    cl_uint numDevices = 0;
#ifndef FPGA_DEVICE
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
#else
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &numDevices);
#endif
	printf("Found %d devices support OpenCL.\n", numDevices);

    // Allocate enough space for each device
    cl_device_id *devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id));

    // Fill in the devices 
#ifndef FPGA_DEVICE
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
#else
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, numDevices, devices, NULL);
#endif
	
	if (status != CL_SUCCESS)
		printf("clGetDeviceIDs error(%d)\n", status);

    // Create a context and associate it with the devices
    cl_context context;
    context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
	if (status != CL_SUCCESS)
		printf("clCreateContext error(%d)\n", status);


	//Create a command-queue
	cl_command_queue clCommandQue = clCreateCommandQueue(context, devices[0], 0, &status);

	if (status != CL_SUCCESS)
		printf("clCreateCommandQueue error(%d)\n", status);

	// 6. Load and build OpenCL kernel
	
#ifndef FPGA_DEVICE
	// Create a program with source code
    cl_program program = clCreateProgramWithSource(context, 1, 
        (const char**)&logistic_cl, NULL, &status);
	if (status != 0)
		printf("clCreateProgramWithSource error(%d)\n", status);

    // Build (compile) the program for the device
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
#else
	// Load binary from disk
	unsigned char *kernelbinary;
	printf("loading %s\n", xclbin);
	int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
	if (n_i < 0) {
		printf("ERROR: failed to load kernel from xclbin: %s\n", xclbin);
		exit(1);
	}
	size_t n_bit = n_i;

	// Create the compute program from offline
	cl_program program = clCreateProgramWithBinary(context, 1, &devices[0], &n_bit,
			(const unsigned char **) &kernelbinary, NULL, &status);
	if ((!program) || (status != CL_SUCCESS)) {
		printf("Error: Failed to create compute program from binary %d!\n", status);
		exit(1);
	}

	// Build the program executable
	status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
#endif

	if (status != 0) {
		char errmsg[2048];
		size_t sizemsg = 0;

		status = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 2048*sizeof(char), errmsg, &sizemsg);

		printf("clBuildProgram error(%d)\n", status);
		printf("Compilation messages: \n %s", errmsg);
	}

	cl_kernel clKernel = clCreateKernel(program, kernel_name, &status);
	if (status != CL_SUCCESS)
		printf("clCreateKernel error(%d)\n", status);

	// TODO: parameterize the size of buffers
	cl_mem d_gradient = clCreateBuffer(context, CL_MEM_READ_WRITE, FEATURE_SIZE*LABEL_SIZE*GROUP_SIZE*sizeof(float), NULL, &status);
	if (status != CL_SUCCESS)
		printf("d_gradient clCreateBuffer error(%d)\n", status);

	cl_mem d_weights = clCreateBuffer(context, CL_MEM_READ_ONLY, FEATURE_SIZE*LABEL_SIZE*sizeof(float), NULL, &status);
	if (status != CL_SUCCESS)
		printf("d_weights clCreateBuffer error(%d)\n", status);

	cl_mem d_data = clCreateBuffer(context, CL_MEM_READ_ONLY, (FEATURE_SIZE+LABEL_SIZE)*CHUNK_SIZE*sizeof(float), NULL, &status);
	if (status != CL_SUCCESS)
		printf("d_data clCreateBuffer error(%d)\n", status);

    struct cl_package result;
    result.context = context;
    result.kernel = clKernel;
    result.commandQueue = clCommandQue;
    result.d_gradient = d_gradient;
    result.d_weights = d_weights;
    result.d_data = d_data;

    return result;
}
Esempio n. 11
0
pclu_program* 
pclu_create_program(pclu_context* pclu, const char* path)
{
    int errcode;
    
    pclu_program* pgm = (pclu_program*) malloc(sizeof(pclu_program));
    pgm->pclu      = pclu;
    pgm->build_log = 0;

#define LOAD_BINS 1

#if LOAD_BINS

    const char* binary = (const char*) pclu_slurp_file("fmma.ptx");
    size_t size = strlen(binary);

    const unsigned char** bins = (const unsigned char**) binary;
    int status;

    pgm->program = clCreateProgramWithBinary(pclu->context, 1, &(pclu->device), 
            &size, bins, &status, &errcode);

    pclu_check_call("clCreateProgramWithBinary", errcode);
    pclu_check_call("clCreateProgramWithBinary status", status);

#else

    /* Read the source from disk */
    char* source = pclu_slurp_file(path);
    size_t  size = strlen(source);

    const char** sources = (const char**) &source;

    pgm->program = clCreateProgramWithSource(pclu->context, 1, sources, &size, &errcode);
    pclu_check_call("clCreateProgramWithSource", errcode);

    free(source);

    /* Compile for the device */
    errcode = clBuildProgram(pgm->program, 1, &(pclu->device), "", 0, 0);

    /* Print out errors on failure */
    if (errcode == CL_BUILD_PROGRAM_FAILURE) {
        size_t log_size;
        char*  log_text;

        pclu_check_call("clGetProgramBuildInfo", 
                clGetProgramBuildInfo(
                    pgm->program, pclu->device, CL_PROGRAM_BUILD_LOG, 0, 0, &log_size));

        log_text = (char*) alloca(log_size);

        pclu_check_call("clGetProgramBuildInfo", 
                clGetProgramBuildInfo(
                    pgm->program, pclu->device, CL_PROGRAM_BUILD_LOG, 
                    log_size, log_text, 0));

        fprintf(stderr, "Build Errors\n%s\n", log_text);
    }

    pclu_check_call("clBuildProgram", errcode);
#endif

#if DUMP_BINS
    /* Dump the Binaries */
    size_t bin_size;
    errcode = clGetProgramInfo(pgm->program, CL_PROGRAM_BINARY_SIZES, 
            sizeof(size_t), &bin_size, 0);
    pclu_check_call("clGetProgramInfo(BIN_SIZE)", errcode);

    cl_uchar* binary = (cl_uchar*) malloc(bin_size);
    errcode = clGetProgramInfo(pgm->program, CL_PROGRAM_BINARIES, bin_size, &binary, 0);
    pclu_check_call("clGetProgramInfo(BINARIES)", errcode);

    FILE* bf = fopen("opencl.bin", "w");
    fwrite((void*)binary, bin_size, 1, bf);
    fclose(bf);

    free(binary);
#endif

    /* Get the kernels */

    /*
    pclu_check_call("clCreateKernelsInProgram",
		    clCreateKernelsInProgram(pgm->program, 0, 0, &(pgm->num_kernels)));

    pgm->kernels = (cl_kernel*) malloc(pgm->num_kernels*sizeof(cl_kernel));
    pclu_check_call("clCreateKernelsInProgram",
		    clCreateKernelsInProgram(pgm->program, pgm->num_kernels, pgm->kernels, 0));
    */

    return pgm;
}
Esempio n. 12
0
File: ocl.c Progetto: furyan/cgminer
_clState *initCl(unsigned int gpu, char *name, size_t nameSize)
{
	int patchbfi = 0;
	cl_int status = 0;
	unsigned int i;

	_clState *clState = calloc(1, sizeof(_clState));

	cl_uint numPlatforms;
	cl_platform_id platform = NULL;
	status = clGetPlatformIDs(0, NULL, &numPlatforms);
	if (status != CL_SUCCESS)
	{
		applog(LOG_ERR, "Error: Getting Platforms. (clGetPlatformsIDs)");
		return NULL;
	}

	if (numPlatforms > 0)
	{
		cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id));
		status = clGetPlatformIDs(numPlatforms, platforms, NULL);
		if (status != CL_SUCCESS)
		{
			applog(LOG_ERR, "Error: Getting Platform Ids. (clGetPlatformsIDs)");
			return NULL;
		}

		for(i = 0; i < numPlatforms; ++i)
		{
			char pbuff[100];
			status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS)
			{
				applog(LOG_ERR, "Error: Getting Platform Info. (clGetPlatformInfo)");
				free(platforms);
				return NULL;
			}
			platform = platforms[i];
			if (!strcmp(pbuff, "Advanced Micro Devices, Inc."))
			{
				break;
			}
		}
		free(platforms);
	}

	if (platform == NULL) {
		perror("NULL platform found!\n");
		return NULL;
	}

	size_t nDevices;
	cl_uint numDevices;
	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
	if (status != CL_SUCCESS)
	{
		applog(LOG_ERR, "Error: Getting Device IDs (num)");
		return NULL;
	}

	cl_device_id *devices;
	if (numDevices > 0 ) {
		devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id));

		/* Now, get the device list data */

		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
		if (status != CL_SUCCESS)
		{
			applog(LOG_ERR, "Error: Getting Device IDs (list)");
			return NULL;
		}

		applog(LOG_INFO, "List of devices:");

		unsigned int i;
		for(i=0; i<numDevices; i++) {
			char pbuff[100];
			status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS)
			{
				applog(LOG_ERR, "Error: Getting Device Info");
				return NULL;
			}

			applog(LOG_INFO, "\t%i\t%s", i, pbuff);
		}

		if (gpu < numDevices) {
			char pbuff[100];
			status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, &nDevices);
			if (status != CL_SUCCESS)
			{
				applog(LOG_ERR, "Error: Getting Device Info");
				return NULL;
			}

			applog(LOG_INFO, "Selected %i: %s", gpu, pbuff);
			strncpy(name, pbuff, nameSize);
		} else {
			applog(LOG_ERR, "Invalid GPU %i", gpu);
			return NULL;
		}

	} else return NULL;

	cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };

	clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
	if (status != CL_SUCCESS)
	{
		applog(LOG_ERR, "Error: Creating Context. (clCreateContextFromType)");
		return NULL;
	}

	/* Check for BFI INT support. Hopefully people don't mix devices with
	 * and without it! */
	char * extensions = malloc(1024);
	const char * camo = "cl_amd_media_ops";
	char *find;

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS");
		return NULL;
	}
	find = strstr(extensions, camo);
	if (find)
		clState->hasBitAlign = patchbfi = 1;

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&clState->preferred_vwidth, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT");
		return NULL;
	}
	if (opt_debug)
		applog(LOG_DEBUG, "Preferred vector width reported %d", clState->preferred_vwidth);

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE");
		return NULL;
	}
	if (opt_debug)
		applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size);

	/* For some reason 2 vectors is still better even if the card says
	 * otherwise, and many cards lie about their max so use 256 as max
	 * unless explicitly set on the command line */
	if (clState->preferred_vwidth > 1)
		clState->preferred_vwidth = 2;
	if (opt_vectors)
		clState->preferred_vwidth = opt_vectors;
	if (opt_worksize && opt_worksize <= clState->max_work_size)
		clState->work_size = opt_worksize;
	else
		clState->work_size = (clState->max_work_size <= 256 ? clState->max_work_size : 256) /
				clState->preferred_vwidth;

	/* Create binary filename based on parameters passed to opencl
	 * compiler to ensure we only load a binary that matches what would
	 * have otherwise created. The filename is:
	 * name + kernelname +/i bitalign + v + vectors + w + work_size + sizeof(long) + .bin
	 */
	char binaryfilename[255];
	char numbuf[10];
	char filename[16];

	if (chosen_kernel == KL_NONE) {
		if (clState->hasBitAlign)
			chosen_kernel = KL_PHATK;
		else
			chosen_kernel = KL_POCLBM;
	}

	switch (chosen_kernel) {
		case KL_POCLBM:
			strcpy(filename, "poclbm110817.cl");
			strcpy(binaryfilename, "poclbm110817");
			break;
		case KL_NONE: /* Shouldn't happen */
		case KL_PHATK:
			strcpy(filename, "phatk110817.cl");
			strcpy(binaryfilename, "phatk110817");
			break;
	}

	FILE *binaryfile;
	size_t *binary_sizes;
	char **binaries;
	int pl;
	char *source, *rawsource = file_contents(filename, &pl);
	size_t sourceSize[] = {(size_t)pl};

	if (!rawsource)
		return NULL;

	source = malloc(pl);
	if (!source) {
		applog(LOG_ERR, "Unable to malloc source");
		return NULL;
	}

	binary_sizes = (size_t *)malloc(sizeof(size_t)*nDevices);
	if (unlikely(!binary_sizes)) {
		applog(LOG_ERR, "Unable to malloc binary_sizes");
		return NULL;
	}
	binaries = (char **)malloc(sizeof(char *)*nDevices);
	if (unlikely(!binaries)) {
		applog(LOG_ERR, "Unable to malloc binaries");
		return NULL;
	}

	strcat(binaryfilename, name);
	if (clState->hasBitAlign)
		strcat(binaryfilename, "bitalign");

	strcat(binaryfilename, "v");
	sprintf(numbuf, "%d", clState->preferred_vwidth);
	strcat(binaryfilename, numbuf);
	strcat(binaryfilename, "w");
	sprintf(numbuf, "%d", (int)clState->work_size);
	strcat(binaryfilename, numbuf);
	strcat(binaryfilename, "long");
	sprintf(numbuf, "%d", (int)sizeof(long));
	strcat(binaryfilename, numbuf);
	strcat(binaryfilename, ".bin");

	binaryfile = fopen(binaryfilename, "rb");
	if (!binaryfile) {
		if (opt_debug)
			applog(LOG_DEBUG, "No binary found, generating from source");
	} else {
		struct stat binary_stat;

		if (unlikely(stat(binaryfilename, &binary_stat))) {
			if (opt_debug)
				applog(LOG_DEBUG, "Unable to stat binary, generating from source");
			fclose(binaryfile);
			goto build;
		}
		binary_sizes[gpu] = binary_stat.st_size;
		binaries[gpu] = (char *)malloc(binary_sizes[gpu]);
		if (unlikely(!binaries[gpu])) {
			applog(LOG_ERR, "Unable to malloc binaries");
			fclose(binaryfile);
			return NULL;
		}

		if (fread(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu]) {
			applog(LOG_ERR, "Unable to fread binaries[gpu]");
			fclose(binaryfile);
			goto build;
		}
		fclose(binaryfile);

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[gpu], (const unsigned char **)&binaries[gpu], &status, NULL);
		if (status != CL_SUCCESS)
		{
			applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)");
			return NULL;
		}
		if (opt_debug)
			applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename);

		free(binaries[gpu]);
		goto built;
	}

	/////////////////////////////////////////////////////////////////
	// Load CL file, build CL program object, create CL kernel object
	/////////////////////////////////////////////////////////////////

build:
	memcpy(source, rawsource, pl);

	/* Patch the source file with the preferred_vwidth */
	if (clState->preferred_vwidth > 1) {
		char *find = strstr(source, "VECTORSX");

		if (unlikely(!find)) {
			applog(LOG_ERR, "Unable to find VECTORSX in source");
			return NULL;
		}
		find += 7; // "VECTORS"
		if (clState->preferred_vwidth == 2)
			strncpy(find, "2", 1);
		else
			strncpy(find, "4", 1);
		if (opt_debug)
			applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->preferred_vwidth);
	}

	/* Patch the source file defining BITALIGN */
	if (clState->hasBitAlign) {
		char *find = strstr(source, "BITALIGNX");

		if (unlikely(!find)) {
			applog(LOG_ERR, "Unable to find BITALIGNX in source");
			return NULL;
		}
		find += 8; // "BITALIGN"
		strncpy(find, " ", 1);
		if (opt_debug)
			applog(LOG_DEBUG, "cl_amd_media_ops found, patched source with BITALIGN");
	} else if (opt_debug)
		applog(LOG_DEBUG, "cl_amd_media_ops not found, will not BITALIGN patch");

	if (patchbfi) {
		char *find = strstr(source, "BFI_INTX");

		if (unlikely(!find)) {
			applog(LOG_ERR, "Unable to find BFI_INTX in source");
			return NULL;
		}
		find += 7; // "BFI_INT"
		strncpy(find, " ", 1);
		if (opt_debug)
			applog(LOG_DEBUG, "cl_amd_media_ops found, patched source with BFI_INT");
	} else if (opt_debug)
		applog(LOG_DEBUG, "cl_amd_media_ops not found, will not BFI_INT patch");

	clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithSource)");
		return NULL;
	}

	clRetainProgram(clState->program);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)");
		return NULL;
	}

	/* create a cl program executable for all the devices specified */
	char CompilerOptions[256];
	sprintf(CompilerOptions, "%s%i", "-DWORKSIZE=", (int)clState->work_size);
	//int n = 1000;
	//while(n--)
	//	printf("%s", CompilerOptions);
	//return 1;
	status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL);

	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Building Program (clBuildProgram)");
		size_t logSize;
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

		char *log = malloc(logSize);
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
		applog(LOG_INFO, "%s", log);
		return NULL;
	}

	status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nDevices, binary_sizes, NULL );
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetPlatformInfo)");
		return NULL;
	}

	/* copy over all of the generated binaries. */
	if (opt_debug)
		applog(LOG_DEBUG, "binary size %d : %d", gpu, binary_sizes[gpu]);
	if (!binary_sizes[gpu]) {
		applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, may need to reboot!");
		return NULL;
	}
	binaries[gpu] = (char *)malloc( sizeof(char)*binary_sizes[gpu]);
	status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL );
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: Getting program info. (clGetPlatformInfo)");
		return NULL;
	}

	/* Patch the kernel if the hardware supports BFI_INT */
	if (patchbfi) {
		unsigned remaining = binary_sizes[gpu];
		char *w = binaries[gpu];
		unsigned int start, length;

		/* Find 2nd incidence of .text, and copy the program's
		* position and length at a fixed offset from that. Then go
		* back and find the 2nd incidence of \x7ELF (rewind by one
		* from ELF) and then patch the opcocdes */
		if (!advance(&w, &remaining, ".text"))
			{patchbfi = 0; goto build;}
		w++; remaining--;
		if (!advance(&w, &remaining, ".text")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		memcpy(&start, w + 285, 4);
		memcpy(&length, w + 289, 4);
		w = binaries[gpu]; remaining = binary_sizes[gpu];
		if (!advance(&w, &remaining, "ELF"))
			{patchbfi = 0; goto build;}
		w++; remaining--;
		if (!advance(&w, &remaining, "ELF")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		w--; remaining++;
		w += start; remaining -= start;
		if (opt_debug)
			applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching",
				w, remaining);
		patch_opcodes(w, length);

		status = clReleaseProgram(clState->program);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Releasing program. (clReleaseProgram)");
			return NULL;
		}

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[gpu], (const unsigned char **)&binaries[gpu], &status, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)");
			return NULL;
		}

		clRetainProgram(clState->program);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)");
			return NULL;
		}
	}

	free(source);
	free(rawsource);

	/* Save the binary to be loaded next time */
	binaryfile = fopen(binaryfilename, "wb");
	if (!binaryfile) {
		/* Not a fatal problem, just means we build it again next time */
		if (opt_debug)
			applog(LOG_DEBUG, "Unable to create file %s", binaryfilename);
	} else {
		if (unlikely(fwrite(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu])) {
			applog(LOG_ERR, "Unable to fwrite to binaryfile");
			return NULL;
		}
		fclose(binaryfile);
	}
	if (binaries[gpu])
		free(binaries[gpu]);
built:
	free(binaries);
	free(binary_sizes);

	applog(LOG_INFO, "Initialising kernel %s with%s BFI_INT patching, %d vectors and worksize %d",
	       filename, patchbfi ? "" : "out", clState->preferred_vwidth, clState->work_size);

	/* create a cl program executable for all the devices specified */
	status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Building Program (clBuildProgram)");
		size_t logSize;
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

		char *log = malloc(logSize);
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
		applog(LOG_INFO, "%s", log);
		return NULL;
	}

	/* get a kernel object handle for a kernel with the given name */
	clState->kernel = clCreateKernel(clState->program, "search", &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Creating Kernel from program. (clCreateKernel)");
		return NULL;
	}

	/////////////////////////////////////////////////////////////////
	// Create an OpenCL command queue
	/////////////////////////////////////////////////////////////////
	clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu],
						     CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status);
	if (status != CL_SUCCESS) /* Try again without OOE enable */
		clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Creating Command Queue. (clCreateCommandQueue)");
		return NULL;
	}

	clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, BUFFERSIZE, NULL, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: clCreateBuffer (outputBuffer)");
		return NULL;
	}

	return clState;
}
Esempio n. 13
0
File: hw2.c Progetto: hemantjp/HW2
int
main(int argc, char** argv)
{


   srand(1000);
   int i;

   unsigned int size_A = WA * HA;
   unsigned int mem_size_A = sizeof(float) * size_A;
   float* h_A = (float*) malloc(mem_size_A);

   unsigned int size_B = WB * HB;
   unsigned int mem_size_B = sizeof(float) * size_B;
   float* h_B = (float*) malloc(mem_size_B);


   randomInit(h_A, size_A);
   randomInit(h_B, size_B);


   unsigned int size_C = WC * HC;
   unsigned int mem_size_C = sizeof(float) * size_C;
   float* h_C = (float*) malloc(mem_size_C);

   cl_context clGPUContext;
   cl_command_queue clCommandQue;
   cl_program clProgram;
   cl_kernel clKernel;
   cl_event mm;

   size_t dataBytes;
   size_t kernelLength;
   cl_int errcode;


   cl_mem d_A;
   cl_mem d_B;
   cl_mem d_C;


   clGPUContext = clCreateContextFromType(0,
                   CL_DEVICE_TYPE_GPU,
                   NULL, NULL, &errcode);



   errcode = clGetContextInfo(clGPUContext,
              CL_CONTEXT_DEVICES, 0, NULL,
              &dataBytes);
   cl_device_id *clDevices = (cl_device_id *)
              malloc(dataBytes);
   errcode |= clGetContextInfo(clGPUContext,
              CL_CONTEXT_DEVICES, dataBytes,
              clDevices, NULL);



   clCommandQue = clCreateCommandQueue(clGPUContext,
                  clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode);



   d_C = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE,
          mem_size_A, NULL, &errcode);
   d_A = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
          mem_size_A, h_A, &errcode);
   d_B = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
          mem_size_B, h_B, &errcode);


   FILE* fp = fopen("hw2.cl", "r");
   fseek (fp , 0 , SEEK_END);
   const size_t lSize = ftell(fp);
   rewind(fp);
   unsigned char* buffer;
   buffer = (unsigned char*) malloc (lSize);
   fread(buffer, 1, lSize, fp);
   fclose(fp);

   cl_int status;
   clProgram = clCreateProgramWithBinary(clGPUContext,
                1, (const cl_device_id *)clDevices,
                &lSize, (const unsigned char**)&buffer,
                &status, &errcode);
   errcode = clBuildProgram(clProgram, 0, NULL, NULL,
                NULL, NULL);


   errcode = clBuildProgram(clProgram, 0,
              NULL, NULL, NULL, NULL);


   clKernel = clCreateKernel(clProgram,
               "MM", &errcode);




   size_t globalWorkSize[2];

   int wA = WA;
   int wC = WC;
   errcode = clSetKernelArg(clKernel, 0,
              sizeof(cl_mem), (void *)&d_C);
   errcode |= clSetKernelArg(clKernel, 1,
              sizeof(cl_mem), (void *)&d_A);
   errcode |= clSetKernelArg(clKernel, 2,
              sizeof(cl_mem), (void *)&d_B);
   errcode |= clSetKernelArg(clKernel, 3,
              sizeof(int), (void *)&wA);
   errcode |= clSetKernelArg(clKernel, 4,
              sizeof(int), (void *)&wC);



   globalWorkSize[0] = 16;
   globalWorkSize[1] = 16;

   cl_ulong time_start, time_end, total_time = 0;

   errcode = clEnqueueNDRangeKernel(clCommandQue,
              clKernel, 2, NULL, globalWorkSize,
              NULL, 0, NULL, &mm);
   printf("Average time = %lu\n");
   clFinish(clCommandQue);

         clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_START,
              sizeof(time_start), &time_start, NULL);
        clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_END,
               sizeof(time_end), &time_end, NULL);
         total_time += time_end - time_start;


         printf("Average time = %lu\n", total_time);
   errcode = clEnqueueReadBuffer(clCommandQue,
              d_C, CL_TRUE, 0, mem_size_C,
              h_C, 0, NULL, NULL);



   free(h_A);
   free(h_B);
   free(h_C);

   clReleaseMemObject(d_A);
   clReleaseMemObject(d_C);
   clReleaseMemObject(d_B);

   free(clDevices);

   clReleaseContext(clGPUContext);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseCommandQueue(clCommandQue);

}
Esempio n. 14
0
WEAK void halide_init_kernels(void *user_context, const char* src, int size) {
    int err;
    cl_device_id dev;
    // Initialize one shared context for all Halide compiled instances
    if (!(*cl_ctx)) {
        const cl_uint maxPlatforms = 4;
        cl_platform_id platforms[maxPlatforms];
        cl_uint platformCount = 0;

        err = clGetPlatformIDs( maxPlatforms, platforms, &platformCount );
        CHECK_ERR( err, "clGetPlatformIDs" );

        cl_platform_id platform = NULL;

        // Find the requested platform, or the first if none specified.
        const char * name = getenv("HL_OCL_PLATFORM");
        if (name != NULL) {
            for (cl_uint i = 0; i < platformCount; ++i) {
                const cl_uint maxPlatformName = 256;
                char platformName[maxPlatformName];
                err = clGetPlatformInfo( platforms[i], CL_PLATFORM_NAME, maxPlatformName, platformName, NULL );
                if (err != CL_SUCCESS) continue;

                if (strstr(platformName, name))
                {
                    platform = platforms[i];
                    break;
                }
            }
        } else if (platformCount > 0) {
            platform = platforms[0];
        }
        if (platform == NULL){
            halide_printf(user_context, "Failed to find OpenCL platform\n");
            return;
        }

        #ifdef DEBUG
        const cl_uint maxPlatformName = 256;
        char platformName[maxPlatformName];
        err = clGetPlatformInfo( platform, CL_PLATFORM_NAME, maxPlatformName, platformName, NULL );
        CHECK_ERR( err, "clGetPlatformInfo" );

        halide_printf(user_context, "Got platform '%s', about to create context (t=%lld)\n",
                      platformName, (long long)halide_current_time_ns(user_context));
        #endif

        cl_device_type device_type = 0;
        // Find the device types requested.
        const char * dev_type = getenv("HL_OCL_DEVICE");
        if (dev_type != NULL) {
            if (strstr("cpu", dev_type))
                device_type |= CL_DEVICE_TYPE_CPU;
            if (strstr("gpu", dev_type))
                device_type |= CL_DEVICE_TYPE_GPU;
        } 
        // If no devices are specified yet, just use all.
        if (device_type == 0)
            device_type = CL_DEVICE_TYPE_ALL;
        
        // Make sure we have a device
        const cl_uint maxDevices = 4;
        cl_device_id devices[maxDevices];
        cl_uint deviceCount = 0;
        err = clGetDeviceIDs( platform, device_type, maxDevices, devices, &deviceCount );
        CHECK_ERR( err, "clGetDeviceIDs" );
        if (deviceCount == 0) {
            halide_printf(user_context, "Failed to get device\n");
            return;
        }

        dev = devices[deviceCount-1];

        #ifdef DEBUG
        const cl_uint maxDeviceName = 256;
        char deviceName[maxDeviceName];
        err = clGetDeviceInfo( dev, CL_DEVICE_NAME, maxDeviceName, deviceName, NULL );
        CHECK_ERR( err, "clGetDeviceInfo" );

        halide_printf(user_context, "Got device '%s', about to create context (t=%lld)\n",
                      deviceName, (long long)halide_current_time_ns(user_context));
        #endif


        // Create context
        cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
        *cl_ctx = clCreateContext(properties, 1, &dev, NULL, NULL, &err);
        CHECK_ERR( err, "clCreateContext" );
        // cuEventCreate(&__start, 0);
        // cuEventCreate(&__end, 0);

        halide_assert(user_context, !(*cl_q));
        *cl_q = clCreateCommandQueue(*cl_ctx, dev, 0, &err);
        CHECK_ERR( err, "clCreateCommandQueue" );
    } else {
        #ifdef DEBUG
        halide_printf(user_context, "Already had context %p\n", *cl_ctx);
        #endif

        // Maintain ref count of context.
        CHECK_CALL( clRetainContext(*cl_ctx), "clRetainContext" );
        CHECK_CALL( clRetainCommandQueue(*cl_q), "clRetainCommandQueue" );

        CHECK_CALL( clGetContextInfo(*cl_ctx, CL_CONTEXT_DEVICES, sizeof(dev), &dev, NULL), "clGetContextInfo" );
    }

    // Initialize a module for just this Halide module
    if ((!__mod) && (size > 1)) {
        // Create module

        cl_device_id devices[] = { dev };
        size_t lengths[] = { size };

        if (strstr(src, "/*OpenCL C*/")) {
            // Program is OpenCL C.

            #ifdef DEBUG
            halide_printf(user_context, "Compiling OpenCL C kernel: %s\n\n", src);
            #endif

            const char * sources[] = { src };
            __mod = clCreateProgramWithSource(*cl_ctx, 1, &sources[0], NULL, &err );
            CHECK_ERR( err, "clCreateProgramWithSource" );
        } else {
            // Program is SPIR binary.

            #ifdef DEBUG
            halide_printf(user_context, "Compiling SPIR kernel (%i bytes)\n", size);
            #endif

            const unsigned char * binaries[] = { (unsigned char *)src };
            __mod = clCreateProgramWithBinary(*cl_ctx, 1, devices, lengths, &binaries[0], NULL, &err );
            CHECK_ERR( err, "clCreateProgramWithBinary" );
        }

        err = clBuildProgram( __mod, 1, &dev, NULL, NULL, NULL );
        if (err != CL_SUCCESS) {
            size_t len;
            char buffer[2048];

            halide_printf(user_context, "Error: Failed to build program executable! err = %d\n", err);
            if (clGetProgramBuildInfo(__mod, dev, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len) == CL_SUCCESS)
                halide_printf(user_context, "Build Log:\n %s\n-----\n", buffer);
            else
                halide_printf(user_context, "clGetProgramBuildInfo failed to get build log!\n");
            halide_assert(user_context, err == CL_SUCCESS);
        }
    }
}
Esempio n. 15
0
int main(int argc, char** argv)
{
  int err;                            // error code returned from api calls
     
  float a1[DATA_SIZE1];               // original data set given to device
  float b1[FILTER_SIZE1];             // original data set given to device
  float c1[OUTPUT_SIZE1];
  float results1[OUTPUT_SIZE1];       // results returned from device
  float sw_results1[OUTPUT_SIZE1];     // results returned from device

  unsigned int correct;               // number of correct results returned

  size_t global[2];                   // global domain size for our calculation
  size_t local[2];                    // local domain size for our calculation

  cl_platform_id platform_id;         // platform id
  cl_device_id device_id;             // compute device id 
  cl_context context;                 // compute context
  cl_command_queue commands;          // compute command queue
  cl_program program;                 // compute program
  cl_kernel kernel;                   // compute kernel
   
  char cl_platform_vendor[1001];
  char cl_platform_name[1001];
   
  cl_mem input_a;                     // device memory used for the input array
  cl_mem input_b;                     // device memory used for the input array
  cl_mem output;                      // device memory used for the output array
   
  if (argc != 2){
    printf("%s <inputfile>\n", argv[0]);
    return EXIT_FAILURE;
  }

  // Fill our data sets with pattern
  //
  int i = 0;
  for(i = 0; i < DATA_SIZE1; i++) {
    a1[i] = (float)1;
  }
  for(i = 0; i < OUTPUT_SIZE1; i++) {
    results1[i] = 0;
    sw_results1[i] = FILTER_SIZE1;
  }
  for(i = 0; i < FILTER_SIZE1; i++) {
    b1[i] = (float)1;
  }
  for(i = 0; i < OUTPUT_SIZE1; i++) {
    c1[i] = (float)0;
  }

  // Connect to first platform
  //
  err = clGetPlatformIDs(1,&platform_id,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to find an OpenCL platform!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor);
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  printf("CL_PLATFORM_NAME %s\n",cl_platform_name);
 
  // Connect to a compute device
  //
  int fpga = 0;
#if defined (FPGA_DEVICE)
  fpga = 1;
#endif
  err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU,
                       1, &device_id, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to create a device group!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  
  // Create a compute context 
  //
  context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  if (!context)
  {
    printf("Error: Failed to create a compute context!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create a command commands
  //
  commands = clCreateCommandQueue(context, device_id, 0, &err);
  if (!commands)
  {
    printf("Error: Failed to create a command commands!\n");
    printf("Error: code %i\n",err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  int status;

  // Create Program Objects
  //
  
  // Load binary from disk
  unsigned char *kernelbinary;
  char *xclbin=argv[1];
  printf("loading %s\n", xclbin);
  int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
  if (n_i < 0) {
    printf("failed to load kernel from xclbin: %s\n", xclbin);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  size_t n = n_i;
  // Create the compute program from offline
  program = clCreateProgramWithBinary(context, 1, &device_id, &n,
                                      (const unsigned char **) &kernelbinary, &status, &err);
  if ((!program) || (err!=CL_SUCCESS)) {
    printf("Error: Failed to create compute program from binary %d!\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Build the program executable
  //
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    size_t len;
    char buffer[2048];

    printf("Error: Failed to build program executable!\n");
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    printf("%s\n", buffer);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create the compute kernel in the program we wish to run
  //
  kernel = clCreateKernel(program, "conv3_layer", &err);
  if (!kernel || err != CL_SUCCESS)
  {
    printf("Error: Failed to create compute kernel!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create the input and output arrays in device memory for our calculation
  //
  input_a = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * DATA_SIZE1, NULL, NULL);
  input_b = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * FILTER_SIZE1, NULL, NULL);
  output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * OUTPUT_SIZE1, NULL, NULL);
  if (!input_a || !input_b || !output)
  {
    printf("Error: Failed to allocate device memory!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }    
    
  // Write our data set into the input array in device memory 
  //
  err = clEnqueueWriteBuffer(commands, input_a, CL_TRUE, 0, sizeof(float) * DATA_SIZE1, a1, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array a!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Write our data set into the input array in device memory 
  //
  err = clEnqueueWriteBuffer(commands, input_b, CL_TRUE, 0, sizeof(float) * FILTER_SIZE1, b1, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array b!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  err = clEnqueueWriteBuffer(commands, output, CL_TRUE, 0, sizeof(float) * OUTPUT_SIZE1, c1, 0, NULL, NULL);
    
  // Set the arguments to our compute kernel
  //
  err = 0;
  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_a);
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input_b);
  err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Execute the kernel over the entire range of our 1d input data set
  // using the maximum number of work group items for this device
  //

#ifdef C_KERNEL
  err = clEnqueueTask(commands, kernel, 0, NULL, NULL);
#else
  global[0] = MATRIX_RANK;
  global[1] = MATRIX_RANK;
  local[0] = MATRIX_RANK;
  local[1] = MATRIX_RANK;
  err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, 
                               (size_t*)&global, (size_t*)&local, 0, NULL, NULL);
#endif
  if (err)
  {
    printf("Error: Failed to execute kernel! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Read back the results from the device to verify the output
  //
  cl_event readevent;
  err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * OUTPUT_SIZE1, results1, 0, NULL, &readevent );  
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output array! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  clWaitForEvents(1, &readevent);
    
  printf("A\n");
  for (i=0;i<DATA_SIZE1;i++) {
    printf("%f ",a1[i]);
    if (((i+1) % NUM_DATA_ROWS) == 0)
      printf("\n");
  }
  printf("B\n");
  for (i=0;i< FILTER_SIZE1;i++) {
    printf("%f ",b1[i]);
    if (((i+1) % NUM_MASK_ROWS) == 0)
      printf("\n");
  }
  printf("res\n");
  for (i=0;i< OUTPUT_SIZE1;i++) {
    printf("%f ",results1[i]);
    if (((i+1) % NUM_OUT_ROWS) == 0)
      printf("\n");
  }
    
  // Validate our results
  //
  correct = 0;
  /* for(i = 0; i < OUTPUT_SIZE1; i++)
  {
    int row = i/MATRIX_RANK;
    int col = i%MATRIX_RANK;
    int running = 0;
    int index;
    for (index=0;index<MATRIX_RANK;index++) {
      int aIndex = row*MATRIX_RANK + index;
      int bIndex = col + index*MATRIX_RANK;
      running += a[aIndex] * b[bIndex];
    }
    sw_results[i] = running;
    }*/
    
  for (i = 0;i < OUTPUT_SIZE1; i++) 
    if(results1[i] == sw_results1[i])
      correct++;
  printf("Software\n");
  for (i=0;i<OUTPUT_SIZE1;i++) {
    //printf("%0.2f ",sw_results[i]);
    printf("%f ",sw_results1[i]);
    if (((i+1) % NUM_OUT_ROWS) == 0)
      printf("\n");
  }
    
    
  // Print a brief summary detailing the results
  //
  printf("Computed '%d/%d' correct values!\n", correct, OUTPUT_SIZE1);
    
  // Shutdown and cleanup
  //
  clReleaseMemObject(input_a);
  clReleaseMemObject(input_b);
  clReleaseMemObject(output);
  clReleaseProgram(program);
  clReleaseKernel(kernel);
  clReleaseCommandQueue(commands);
  clReleaseContext(context);

  if(correct == OUTPUT_SIZE1){
    printf("Test passed!\n");
    return EXIT_SUCCESS;
  }
  else{
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
}
Esempio n. 16
0
_clState *initCl(unsigned int gpu, char *name, size_t nameSize)
{
	_clState *clState = (_clState *)calloc(1, sizeof(_clState));
	bool patchbfi = false, prog_built = false;
	struct cgpu_info *cgpu = &gpus[gpu];
	cl_platform_id platform = NULL;
	char pbuff[256], vbuff[255];
	cl_platform_id* platforms;
	cl_uint preferred_vwidth;
	cl_device_id *devices;
	cl_uint numPlatforms;
	cl_uint numDevices;
	cl_int status;

	status = clGetPlatformIDs(0, NULL, &numPlatforms);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Platforms. (clGetPlatformsIDs)", status);
		return NULL;
	}

	platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id));
	status = clGetPlatformIDs(numPlatforms, platforms, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status);
		return NULL;
	}

	if (opt_platform_id >= (int)numPlatforms) {
		applog(LOG_ERR, "Specified platform that does not exist");
		return NULL;
	}

	status = clGetPlatformInfo(platforms[opt_platform_id], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status);
		return NULL;
	}
	platform = platforms[opt_platform_id];

	if (platform == NULL) {
		perror("NULL platform found!\n");
		return NULL;
	}

	applog(LOG_INFO, "CL Platform vendor: %s", pbuff);
	status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL);
	if (status == CL_SUCCESS)
		applog(LOG_INFO, "CL Platform name: %s", pbuff);
	status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(vbuff), vbuff, NULL);
	if (status == CL_SUCCESS)
		applog(LOG_INFO, "CL Platform version: %s", vbuff);

	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status);
		return NULL;
	}

	if (numDevices > 0 ) {
		devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id));

		/* Now, get the device list data */

		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Getting Device IDs (list)", status);
			return NULL;
		}

		applog(LOG_INFO, "List of devices:");

		unsigned int i;
		for (i = 0; i < numDevices; i++) {
			status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS) {
				applog(LOG_ERR, "Error %d: Getting Device Info", status);
				return NULL;
			}

			applog(LOG_INFO, "\t%i\t%s", i, pbuff);
		}

		if (gpu < numDevices) {
			status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS) {
				applog(LOG_ERR, "Error %d: Getting Device Info", status);
				return NULL;
			}

			applog(LOG_INFO, "Selected %i: %s", gpu, pbuff);
			strncpy(name, pbuff, nameSize);
		} else {
			applog(LOG_ERR, "Invalid GPU %i", gpu);
			return NULL;
		}

	} else return NULL;

	cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };

	clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status);
		return NULL;
	}

	/////////////////////////////////////////////////////////////////
	// Create an OpenCL command queue
	/////////////////////////////////////////////////////////////////
	clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu],
						     CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status);
	if (status != CL_SUCCESS) /* Try again without OOE enable */
		clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status);
		return NULL;
	}

	/* Check for BFI INT support. Hopefully people don't mix devices with
	 * and without it! */
	char * extensions = (char *)malloc(1024);
	const char * camo = "cl_amd_media_ops";
	char *find;

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS", status);
		return NULL;
	}
	find = strstr(extensions, camo);
	if (find)
		clState->hasBitAlign = true;

	/* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */
	char * devoclver = (char *)malloc(1024);
	const char * ocl10 = "OpenCL 1.0";
	const char * ocl11 = "OpenCL 1.1";

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_VERSION, 1024, (void *)devoclver, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_VERSION", status);
		return NULL;
	}
	find = strstr(devoclver, ocl10);
	if (!find) {
		clState->hasOpenCL11plus = true;
		find = strstr(devoclver, ocl11);
		if (!find)
			clState->hasOpenCL12plus = true;
	}

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&preferred_vwidth, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", status);
		return NULL;
	}
	applog(LOG_DEBUG, "Preferred vector width reported %d", preferred_vwidth);

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE", status);
		return NULL;
	}
	applog(LOG_DEBUG, "Max work group size reported %d", (int)(clState->max_work_size));

	size_t compute_units = 0;
	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), (void *)&compute_units, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_COMPUTE_UNITS", status);
		return NULL;
	}
	// AMD architechture got 64 compute shaders per compute unit.
	// Source: http://www.amd.com/us/Documents/GCN_Architecture_whitepaper.pdf
	clState->compute_shaders = compute_units * 64;
	applog(LOG_DEBUG, "Max shaders calculated %d", (int)(clState->compute_shaders));

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status);
		return NULL;
	}
	applog(LOG_DEBUG, "Max mem alloc size is %lu", (long unsigned int)(cgpu->max_alloc));

	/* Create binary filename based on parameters passed to opencl
	 * compiler to ensure we only load a binary that matches what
	 * would have otherwise created. The filename is:
	 * name + kernelname + g + lg + lookup_gap + tc + thread_concurrency + nf + nfactor + w + work_size + l + sizeof(long) + .bin
	 */
	char binaryfilename[255];
	char filename[255];
	char strbuf[32];

	if (cgpu->kernelname == NULL) {
		applog(LOG_INFO, "No kernel specified, defaulting to ckolivas");
		cgpu->kernelname = strdup("ckolivas");
	}



	if (strcmp(cgpu->kernelname, ALEXKARNEW_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel alexkarnew is experimental.");
		strcpy(filename, ALEXKARNEW_KERNNAME".cl");
		strcpy(binaryfilename, ALEXKARNEW_KERNNAME);
	} else if (strcmp(cgpu->kernelname, ALEXKAROLD_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel alexkarold is experimental.");
		strcpy(filename, ALEXKAROLD_KERNNAME".cl");
		strcpy(binaryfilename, ALEXKAROLD_KERNNAME);
	} else if (strcmp(cgpu->kernelname, CKOLIVAS_KERNNAME) == 0){
		strcpy(filename, CKOLIVAS_KERNNAME".cl");
		strcpy(binaryfilename, CKOLIVAS_KERNNAME);
	} else if (strcmp(cgpu->kernelname, PSW_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel psw is experimental.");
		strcpy(filename, PSW_KERNNAME".cl");
		strcpy(binaryfilename, PSW_KERNNAME);
	} else if (strcmp(cgpu->kernelname, ZUIKKIS_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel zuikkis is experimental.");
		strcpy(filename, ZUIKKIS_KERNNAME".cl");
		strcpy(binaryfilename, ZUIKKIS_KERNNAME);
		/* Kernel only supports worksize 256 */
		cgpu->work_size = 256;
	} else if (strcmp(cgpu->kernelname, DARKCOIN_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel darkcoin is experimental.");
		strcpy(filename, DARKCOIN_KERNNAME".cl");
		strcpy(binaryfilename, DARKCOIN_KERNNAME);
	} else if (strcmp(cgpu->kernelname, QUBITCOIN_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel qubitcoin is experimental.");
		strcpy(filename, QUBITCOIN_KERNNAME".cl");
		strcpy(binaryfilename, QUBITCOIN_KERNNAME);
	} else if (strcmp(cgpu->kernelname, QUARKCOIN_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel quarkcoin is experimental.");
		strcpy(filename, QUARKCOIN_KERNNAME".cl");
		strcpy(binaryfilename, QUARKCOIN_KERNNAME);
	} else if (strcmp(cgpu->kernelname, FUGUECOIN_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel fuguecoin is experimental.");
		strcpy(filename, FUGUECOIN_KERNNAME".cl");
		strcpy(binaryfilename, FUGUECOIN_KERNNAME);
	} else if (strcmp(cgpu->kernelname, INKCOIN_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel inkcoin is experimental.");
		strcpy(filename, INKCOIN_KERNNAME".cl");
		strcpy(binaryfilename, INKCOIN_KERNNAME);
	} else if (strcmp(cgpu->kernelname, ANIMECOIN_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel animecoin is experimental.");
		strcpy(filename, ANIMECOIN_KERNNAME".cl");
		strcpy(binaryfilename, ANIMECOIN_KERNNAME);
	} else if (strcmp(cgpu->kernelname, GROESTLCOIN_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel groestlcoin is experimental.");
		strcpy(filename, GROESTLCOIN_KERNNAME".cl");
		strcpy(binaryfilename, GROESTLCOIN_KERNNAME);
	} else if (strcmp(cgpu->kernelname, SIFCOIN_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel groestlcoin is experimental.");
		strcpy(filename, SIFCOIN_KERNNAME".cl");
		strcpy(binaryfilename, SIFCOIN_KERNNAME);
	} else if (strcmp(cgpu->kernelname, MYRIADCOIN_GROESTL_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel myriadcoin-groestl is experimental.");
		strcpy(filename, MYRIADCOIN_GROESTL_KERNNAME".cl");
		strcpy(binaryfilename, MYRIADCOIN_GROESTL_KERNNAME);
	} else if (strcmp(cgpu->kernelname, MYRIADCOIN_SKEIN_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel myriadcoin-skein is experimental.");
		strcpy(filename, MYRIADCOIN_SKEIN_KERNNAME".cl");
		strcpy(binaryfilename, MYRIADCOIN_SKEIN_KERNNAME);
	} else if (strcmp(cgpu->kernelname, MYRIADCOIN_QUBIT_KERNNAME) == 0){
		applog(LOG_WARNING, "Kernel myriadcoin-qubit is experimental.");
		strcpy(filename, MYRIADCOIN_QUBIT_KERNNAME".cl");
		strcpy(binaryfilename, MYRIADCOIN_QUBIT_KERNNAME);
	} else {
		applog(LOG_WARNING, "Kernel was not chosen.");
	}

	/* For some reason 2 vectors is still better even if the card says
	 * otherwise, and many cards lie about their max so use 256 as max
	 * unless explicitly set on the command line. Tahiti prefers 1 */
	if (strstr(name, "Tahiti"))
		preferred_vwidth = 1;
	else if (preferred_vwidth > 2)
		preferred_vwidth = 2;

	/* All available kernels only support vector 1 */
	cgpu->vwidth = 1;

	/* Vectors are hard-set to 1 above. */
	if (likely(cgpu->vwidth))
		clState->vwidth = cgpu->vwidth;
	else {
		clState->vwidth = preferred_vwidth;
		cgpu->vwidth = preferred_vwidth;
	}

	clState->goffset = true;

	if (cgpu->work_size && cgpu->work_size <= clState->max_work_size)
		clState->wsize = cgpu->work_size;
	else
		clState->wsize = 256;

	if (!cgpu->opt_lg) {
		applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu);
		cgpu->lookup_gap = 2;
	} else
		cgpu->lookup_gap = cgpu->opt_lg;

	if ((strcmp(cgpu->kernelname, "zuikkis") == 0) && (cgpu->lookup_gap != 2)) {
		applog(LOG_WARNING, "Kernel zuikkis only supports lookup-gap = 2 (currently %d), forcing.", cgpu->lookup_gap);
		cgpu->lookup_gap = 2;
	}

	if (!cgpu->opt_tc) {
		unsigned int sixtyfours;

		sixtyfours =  cgpu->max_alloc / 131072 / 64 / (algorithm->n/1024) - 1;
		cgpu->thread_concurrency = sixtyfours * 64;
		if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) {
			cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders;
			if (cgpu->thread_concurrency > cgpu->shaders * 5)
				cgpu->thread_concurrency = cgpu->shaders * 5;
		}
		applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %d", gpu, (int)(cgpu->thread_concurrency));
	} else
		cgpu->thread_concurrency = cgpu->opt_tc;


	FILE *binaryfile;
	size_t *binary_sizes;
	char **binaries;
	int pl;
	char *source = file_contents(filename, &pl);
	size_t sourceSize[] = {(size_t)pl};
	cl_uint slot, cpnd;

	slot = cpnd = 0;

	if (!source)
		return NULL;

	binary_sizes = (size_t *)calloc(sizeof(size_t) * MAX_GPUDEVICES * 4, 1);
	if (unlikely(!binary_sizes)) {
		applog(LOG_ERR, "Unable to calloc binary_sizes");
		return NULL;
	}
	binaries = (char **)calloc(sizeof(char *) * MAX_GPUDEVICES * 4, 1);
	if (unlikely(!binaries)) {
		applog(LOG_ERR, "Unable to calloc binaries");
		return NULL;
	}

	strcat(binaryfilename, name);
	if (clState->goffset)
		strcat(binaryfilename, "g");

	sprintf(strbuf, "lg%utc%unf%u", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, algorithm->nfactor);
	strcat(binaryfilename, strbuf);

	sprintf(strbuf, "w%d", (int)clState->wsize);
	strcat(binaryfilename, strbuf);
	sprintf(strbuf, "l%d", (int)sizeof(long));
	strcat(binaryfilename, strbuf);
	strcat(binaryfilename, ".bin");

	binaryfile = fopen(binaryfilename, "rb");
	if (!binaryfile) {
		applog(LOG_DEBUG, "No binary found, generating from source");
	} else {
		struct stat binary_stat;

		if (unlikely(stat(binaryfilename, &binary_stat))) {
			applog(LOG_DEBUG, "Unable to stat binary, generating from source");
			fclose(binaryfile);
			goto build;
		}
		if (!binary_stat.st_size)
			goto build;

		binary_sizes[slot] = binary_stat.st_size;
		binaries[slot] = (char *)calloc(binary_sizes[slot], 1);
		if (unlikely(!binaries[slot])) {
			applog(LOG_ERR, "Unable to calloc binaries");
			fclose(binaryfile);
			return NULL;
		}

		if (fread(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot]) {
			applog(LOG_ERR, "Unable to fread binaries");
			fclose(binaryfile);
			free(binaries[slot]);
			goto build;
		}

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
			fclose(binaryfile);
			free(binaries[slot]);
			goto build;
		}

		fclose(binaryfile);
		applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename);

		goto built;
	}

	/////////////////////////////////////////////////////////////////
	// Load CL file, build CL program object, create CL kernel object
	/////////////////////////////////////////////////////////////////

build:
	applog(LOG_NOTICE, "Building binary %s", binaryfilename);

	clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithSource)", status);
		return NULL;
	}

	/* create a cl program executable for all the devices specified */
	char *CompilerOptions = (char *)calloc(1, 256);

	sprintf(CompilerOptions, "-I \"%s\" -I \"%s\" -I \"%skernel\" -I \".\" -D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d -D NFACTOR=%d",
                        opt_kernel_path, sgminer_path, sgminer_path,
			cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, (int)clState->wsize, (unsigned int)algorithm->nfactor);

	applog(LOG_DEBUG, "Setting worksize to %d", (int)(clState->wsize));
	if (clState->vwidth > 1)
		applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth);

	if (clState->hasBitAlign) {
		strcat(CompilerOptions, " -D BITALIGN");
		applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN");
		if (!clState->hasOpenCL12plus &&
		    (strstr(name, "Cedar") ||
		     strstr(name, "Redwood") ||
		     strstr(name, "Juniper") ||
		     strstr(name, "Cypress" ) ||
		     strstr(name, "Hemlock" ) ||
		     strstr(name, "Caicos" ) ||
		     strstr(name, "Turks" ) ||
		     strstr(name, "Barts" ) ||
		     strstr(name, "Cayman" ) ||
		     strstr(name, "Antilles" ) ||
		     strstr(name, "Wrestler" ) ||
		     strstr(name, "Zacate" ) ||
		     strstr(name, "WinterPark" )))
			patchbfi = true;
	} else
		applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN");

	if (patchbfi) {
		strcat(CompilerOptions, " -D BFI_INT");
		applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT");
	} else
		applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch");

	if (clState->goffset)
		strcat(CompilerOptions, " -D GOFFSET");

	if (!clState->hasOpenCL11plus)
		strcat(CompilerOptions, " -D OCL1");

	applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions);
	status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL);
	free(CompilerOptions);

	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status);
		size_t logSize;
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

		char *log = (char *)malloc(logSize);
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
		applog(LOG_ERR, "%s", log);
		return NULL;
	}

	prog_built = true;

#ifdef __APPLE__
	/* OSX OpenCL breaks reading off binaries with >1 GPU so always build
	 * from source. */
	goto built;
#endif

	status = clGetProgramInfo(clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)", status);
		return NULL;
	}

	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)", status);
		return NULL;
	}

	/* The actual compiled binary ends up in a RANDOM slot! Grr, so we have
	 * to iterate over all the binary slots and find where the real program
	 * is. What the heck is this!? */
	for (slot = 0; slot < cpnd; slot++)
		if (binary_sizes[slot])
			break;

	/* copy over all of the generated binaries. */
	applog(LOG_DEBUG, "Binary size for gpu %d found in binary slot %d: %d", gpu, slot, (int)(binary_sizes[slot]));
	if (!binary_sizes[slot]) {
		applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!");
		return NULL;
	}
	binaries[slot] = (char *)calloc(sizeof(char)* binary_sizes[slot], 1);
	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL );
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)", status);
		return NULL;
	}

	/* Patch the kernel if the hardware supports BFI_INT but it needs to
	 * be hacked in */
	if (patchbfi) {
		unsigned remaining = binary_sizes[slot];
		char *w = binaries[slot];
		unsigned int start, length;

		/* Find 2nd incidence of .text, and copy the program's
		* position and length at a fixed offset from that. Then go
		* back and find the 2nd incidence of \x7ELF (rewind by one
		* from ELF) and then patch the opcocdes */
		if (!advance(&w, &remaining, ".text"))
			goto build;
		w++; remaining--;
		if (!advance(&w, &remaining, ".text")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		memcpy(&start, w + 285, 4);
		memcpy(&length, w + 289, 4);
		w = binaries[slot]; remaining = binary_sizes[slot];
		if (!advance(&w, &remaining, "ELF"))
			goto build;
		w++; remaining--;
		if (!advance(&w, &remaining, "ELF")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		w--; remaining++;
		w += start; remaining -= start;
		applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching",
			w, remaining);
		patch_opcodes(w, length);

		status = clReleaseProgram(clState->program);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Releasing program. (clReleaseProgram)", status);
			return NULL;
		}

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
			return NULL;
		}

		/* Program needs to be rebuilt */
		prog_built = false;
	}

	free(source);

	/* Save the binary to be loaded next time */
	binaryfile = fopen(binaryfilename, "wb");
	if (!binaryfile) {
		/* Not fatal, just means we build it again next time */
		applog(LOG_DEBUG, "Unable to create file %s", binaryfilename);
	} else {
		if (unlikely(fwrite(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot])) {
			applog(LOG_ERR, "Unable to fwrite to binaryfile");
			return NULL;
		}
		fclose(binaryfile);
	}
built:
	if (binaries[slot])
		free(binaries[slot]);
	free(binaries);
	free(binary_sizes);

	applog(LOG_NOTICE, "Initialising kernel %s with%s bitalign, %spatched BFI",
	       filename, clState->hasBitAlign ? "" : "out", patchbfi ? "" : "un");

	if (!prog_built) {
		/* create a cl program executable for all the devices specified */
		status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status);
			size_t logSize;
			status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

			char *log = (char *)malloc(logSize);
			status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
			applog(LOG_ERR, "%s", log);
			return NULL;
		}
	}

	/* get a kernel object handle for a kernel with the given name */
	clState->kernel = clCreateKernel(clState->program, "search", &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Creating Kernel from program. (clCreateKernel)", status);
		return NULL;
	}

	size_t ipt = (algorithm->n / cgpu->lookup_gap +
		      (algorithm->n % cgpu->lookup_gap > 0));
	size_t bufsize = 128 * ipt * cgpu->thread_concurrency;

	/* Use the max alloc value which has been rounded to a power of
	 * 2 greater >= required amount earlier */
	if (bufsize > cgpu->max_alloc) {
		applog(LOG_WARNING, "Maximum buffer memory device %d supports says %lu",
			   gpu, (unsigned long)(cgpu->max_alloc));
		applog(LOG_WARNING, "Your scrypt settings come to %lu", (unsigned long)bufsize);
	}
	applog(LOG_DEBUG, "Creating scrypt buffer sized %lu", (unsigned long)bufsize);
	clState->padbufsize = bufsize;

	/* This buffer is weird and might work to some degree even if
	 * the create buffer call has apparently failed, so check if we
	 * get anything back before we call it a failure. */
	clState->padbuffer8 = NULL;
	clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status);
	if (status != CL_SUCCESS && !clState->padbuffer8) {
		applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease TC or increase LG", status);
		return NULL;
	}

	clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status);
		return NULL;
	}
	clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);

	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status);
		return NULL;
	}

	return clState;
}
Esempio n. 17
0
// Main function 
// *********************************************************************
int main(int argc, char **argv)
{
	void *srcA, *srcB, *dst;        // Host buffers for OpenCL test
    cl_context cxGPUContext;       // OpenCL context
    cl_command_queue cqCommandQue;  // OpenCL command que
    cl_device_id* cdDevices;        // OpenCL device list    
    cl_program cpProgram;           // OpenCL program
    cl_kernel ckKernel;             // OpenCL kernel
    cl_mem cmMemObjs[3];            // OpenCL memory buffer objects:  3 for device
    size_t szGlobalWorkSize[1];     // 1D var for Total # of work items
    size_t szLocalWorkSize[1];		// 1D var for # of work items in the work group	
    size_t szParmDataBytes;			// Byte size of context information
    cl_int ciErr1, ciErr2;			// Error code var
    int iTestN = 100000 * 8;		// Size of Vectors to process

    // set Global and Local work size dimensions
    szGlobalWorkSize[0] = iTestN >> 3;  // do 8 computations per work item
    szLocalWorkSize[0]= iTestN>>3;


    // Allocate and initialize host arrays
    srcA = (void *)malloc (sizeof(cl_float) * iTestN);
    srcB = (void *)malloc (sizeof(cl_float) * iTestN);
    dst = (void *)malloc (sizeof(cl_float) * iTestN);

	int i;

	// Initialize arrays with some values
	for (i=0;i<iTestN;i++)
	{
		((cl_float*)srcA)[i] = cl_float(i);
		((cl_float*)srcB)[i] = 2;
		((cl_float*)dst)[i]=-1;
	}

    // Create OpenCL context & context
    cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_CPU, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
	
    // Query all devices available to the context
    ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
    cdDevices = (cl_device_id*)malloc(szParmDataBytes);
    ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
	if (cdDevices)
	{
		printDevInfo(cdDevices[0]);
	}

    // Create a command queue for first device the context reported
    cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
    ciErr1 |= ciErr2; 

    // Allocate the OpenCL source and result buffer memory objects on the device GMEM
    cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2);
    ciErr1 |= ciErr2;
    cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2);
    ciErr1 |= ciErr2;
    cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2);
    ciErr1 |= ciErr2;

///create kernels from binary
	int numDevices = 1;
	cl_int err;
	::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
	const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));

	for (i = 0; i < numDevices; ++i) {
		images[i] = 0;
		lengths[i] = 0;
	}

	cpProgram = clCreateProgramWithBinary(cxGPUContext, numDevices,cdDevices,lengths, images, 0, &err);

	// Build the executable program from a binary
	ciErr1 |= clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);

    // Create the kernel
    ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
    
    // Set the Argument values
    ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
    ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
    ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);

    // Copy input data from host to GPU and launch kernel 
    ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);

    // Read back results and check accumulated errors
    ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);

    // Release kernel, program, and memory objects
	// NOTE:  Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
    free(cdDevices);
	clReleaseKernel(ckKernel);  
    clReleaseProgram(cpProgram);
    clReleaseCommandQueue(cqCommandQue);
    clReleaseContext(cxGPUContext);


    // print the results
    int iErrorCount = 0;
    for (i = 0; i < iTestN; i++) 
    {
		if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
			iErrorCount++;
    }
	
	if (iErrorCount)
	{
		printf("MiniCL validation FAILED\n");
	} else
	{
		printf("MiniCL validation SUCCESSFULL\n");
	}
    // Free host memory, close log and return success
	for (i = 0; i < 3; i++)
    {
        clReleaseMemObject(cmMemObjs[i]);
    }

    free(srcA); 
    free(srcB);
    free (dst);
}
bool
initOpenCL(ComputeEnv *env)
{
        int r = cllib_init();
        if (r < 0) {
                return false;
        }

        cl_uint num_plt;
        cl_platform_id plts[16];
        clGetPlatformIDs(16, plts, &num_plt);
        bool found = false;
        cl_int err;

        cl_platform_id platform;
        cl_context context;
        cl_device_id dev;
        cl_command_queue queue;
        cl_kernel ker_filter, ker_filter_in1_out32, ker_filter_in128_out1;
        cl_kernel ker_filter_in3_out32, ker_filter_in128_out3;
        cl_program program = 0;

        for (unsigned int i=0; i<num_plt; i++) {
                size_t sz;
                cl_uint num_dev;

                clGetPlatformInfo(plts[i], CL_PLATFORM_NAME, 0, nullptr, &sz);
                std::vector<char> name(sz);
                clGetPlatformInfo(plts[i], CL_PLATFORM_NAME, sz, &name[0], &sz);

                bool is_amd = strstr(&name[0], "AMD") != NULL;
                bool is_apple = strstr(&name[0], "Apple") != NULL;
                //bool is_intel = strstr(&name[0], "Intel") != NULL;
                //bool is_nvidia = strstr(&name[0], "NVIDIA") != NULL;

                if (!is_amd && !is_apple) {
                        continue;
                }

                clGetDeviceIDs(plts[i], CL_DEVICE_TYPE_GPU, 0, nullptr, &num_dev);
                if (num_dev == 0) {
                        continue;
                }

                std::vector<cl_device_id> devs(num_dev);
                clGetDeviceIDs(plts[i], CL_DEVICE_TYPE_GPU, num_dev, &devs[0], &num_dev);

                platform = plts[i];
                dev = devs[0];

                cl_context_properties props[] =
                        {CL_CONTEXT_PLATFORM, (cl_context_properties)(plts[i]), 0};
                cl_context ctxt = clCreateContext(props, 1, &devs[0], NULL, NULL, &err);
                if (err != CL_SUCCESS) {
                        continue;
                }

                context = ctxt;

                found = true;
                break;
        }

        if (!found) {
                return false;
        }

        size_t dev_name_len;
        clGetDeviceInfo(dev, CL_DEVICE_NAME, 0, nullptr, &dev_name_len);
        std::vector<char> dev_name(dev_name_len+1);
        clGetDeviceInfo(dev, CL_DEVICE_NAME, dev_name_len, &dev_name[0], &dev_name_len);

        bool bin_avaiable = false;

#if defined __linux || _WIN32

#ifdef __linux
        ssize_t path_len = 4;
        char *self_path = (char*)malloc(path_len+1);
        while (1) {
                ssize_t r = readlink("/proc/self/exe", self_path, path_len);
                if (r < path_len) {
                        self_path[r] = '\0';
                        break;
                }

                path_len *= 2;
                self_path = (char*)realloc(self_path, path_len+1);
        }

        struct stat self_st;
        stat(self_path, &self_st);
        self_path = dirname(self_path);
#else
        size_t path_len = 4;
        char *self_path = (char*)malloc(path_len+1);
	DWORD len;
        while (1) {
		len = GetModuleFileName(NULL, self_path, path_len);
		if (len > 0 && len != path_len) {
			break;
		}

                path_len *= 2;
                self_path = (char*)realloc(self_path, path_len+1);
        }
	WIN32_FIND_DATA self_st;
	HANDLE finder = FindFirstFile(self_path, &self_st);
	FindClose(finder);

	for (int si=len-1; si>=0; si--) {
		if (self_path[si] == '\\') {
			self_path[si] = '\0';
			break;
		}
	}
#endif

        std::string bin_path = std::string(self_path) + "/" + &dev_name[0] + ".bin";

        FILE *binfp = fopen(bin_path.c_str(), "rb");
        if (binfp) {
#ifdef __linux
                struct stat bin_st;
                stat(bin_path.c_str(), &bin_st);

                bool old = false;
                if (bin_st.st_mtim.tv_sec < self_st.st_mtim.tv_sec) {
                        old = true;
                }

                if (bin_st.st_mtim.tv_sec == self_st.st_mtim.tv_sec) {
                        if (bin_st.st_mtim.tv_nsec < self_st.st_mtim.tv_nsec) {
                                old = true;
                        }
                }
		size_t bin_sz = bin_st.st_size;
#else
                WIN32_FIND_DATA bin_st;
		HANDLE finder = FindFirstFile(bin_path.c_str(), &bin_st);
		FindClose(finder);

		bool old = false;
		uint64_t self_time = (((uint64_t)self_st.ftLastWriteTime.dwHighDateTime)<<32) |
			((uint64_t)self_st.ftLastWriteTime.dwLowDateTime);
		uint64_t bin_time = (((uint64_t)bin_st.ftLastWriteTime.dwHighDateTime)<<32) |
			((uint64_t)bin_st.ftLastWriteTime.dwLowDateTime);

		if (bin_time < self_time) {
			old = true;
		}

		size_t bin_sz = bin_st.nFileSizeLow;
#endif

                if (!old) {
                        unsigned char *bin = (unsigned char*)malloc(bin_sz);

                        size_t rem = bin_sz;
                        unsigned char *p = bin;
                        while (rem) {
                                size_t rsz = fread(p, 1, rem, binfp);
                                if (rsz <= 0) {
                                        break;
                                }

                                rem -= rsz;
                                p += rsz;
                        }

                        if (rem == 0) {
                                cl_int err;
                                program = clCreateProgramWithBinary(context, 1, &dev, &bin_sz,
                                                                    (const unsigned char**)&bin, NULL, &err);

                                if (err == CL_SUCCESS) {
                                        bin_avaiable = true;
                                }
                        }

                        free(bin);
                }

                fclose(binfp);
        }
#endif

        if (! bin_avaiable) {
                const char *source[1] = {prog};
                size_t src_len[1] = {sizeof(prog)-1};

                program = clCreateProgramWithSource(context, 1, source, src_len, &err);
                if (err != CL_SUCCESS) {
                        clReleaseContext(context);
                        return false;
                }

        }

#if defined __linux || defined _WIN32
        free(self_path);
#endif

        err = clBuildProgram(program, 1, &dev, "" , nullptr, nullptr);
        if (err != CL_SUCCESS) {
                size_t log_len;
                clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_len);

                std::vector<char> log(log_len+1);
                clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, log_len, &log[0], &log_len);
                log[log_len] = '\0';

                puts(&log[0]);

                clReleaseProgram(program);
                clReleaseContext(context);
                return false;
        }



#if defined __linux || _WIN32
        if (!bin_avaiable) {
                size_t binsz;
                size_t ret_len;
                clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(binsz), &binsz, &ret_len);

                char *buffer = new char [binsz];
                char *ptrs[1];
                ptrs[0] = buffer;

                clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(ptrs), ptrs, &ret_len);

                FILE *fp = fopen(bin_path.c_str(), "wb");

                size_t rem = binsz;
                char *p = buffer;

                while (rem) {
                        size_t wsz = fwrite(p, 1, rem, fp);
                        if (wsz <= 0) {
                                fclose(fp);
                                unlink(bin_path.c_str());
                                fp=NULL;
                                break;
                        }
                        rem -= wsz;
                        p += wsz;
                }

                if (fp) {
                        fclose(fp);
                }

                delete [] buffer;
        }
#endif



        ker_filter = clCreateKernel(program, "filter", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                return false;
        }

        ker_filter_in1_out32 = clCreateKernel(program, "filter_in1_out32", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                return false;
        }

        ker_filter_in3_out32 = clCreateKernel(program, "filter_in3_out32", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                clReleaseKernel(ker_filter_in1_out32);
                return false;
        }

        ker_filter_in128_out1 = clCreateKernel(program, "filter_in128_out1", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                clReleaseKernel(ker_filter_in1_out32);
                return false;
        }

        ker_filter_in128_out3 = clCreateKernel(program, "filter_in128_out3", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                clReleaseKernel(ker_filter_in1_out32);
                return false;
        }

        queue = clCreateCommandQueue(context, dev, 0, &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                clReleaseKernel(ker_filter_in1_out32);
                return false;
        }

        env->num_cl_dev = 1;
        env->cl_dev_list = new OpenCLDev[1];

        env->cl_dev_list[0].platform = platform;
        env->cl_dev_list[0].context = context;
        env->cl_dev_list[0].devid = dev;
        env->cl_dev_list[0].queue = queue;
        env->cl_dev_list[0].program = program;
        env->cl_dev_list[0].ker_filter = ker_filter;
        env->cl_dev_list[0].ker_filter_in1_out32 = ker_filter_in1_out32;
        env->cl_dev_list[0].ker_filter_in128_out1 = ker_filter_in128_out1;
        env->cl_dev_list[0].ker_filter_in3_out32 = ker_filter_in3_out32;
        env->cl_dev_list[0].ker_filter_in128_out3 = ker_filter_in128_out3;
        env->cl_dev_list[0].name = &dev_name[0];

        return true;
}
Esempio n. 19
0
int 
exec_dot_product_kernel(const char *program_source, size_t source_size,
                        int n, cl_float4 *srcA, cl_float4 *srcB, cl_float *dst) 
{ 
  cl_context  context; 
  cl_command_queue cmd_queue; 
  cl_device_id  *devices; 
  cl_program  program; 
  cl_kernel  kernel; 
  cl_mem       memobjs[3]; 
  size_t       global_work_size[1]; 
  size_t       local_work_size[1]; 
  size_t       cb; 
  cl_int       err; 
  int          i;
  context = poclu_create_any_context();
  if (context == (cl_context)0) 
    return -1; 
 
  // get the list of GPU devices associated with context 
  clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); 
  devices = (cl_device_id *) malloc(cb); 
  clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); 
 
  // create a command-queue 
  cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (cmd_queue == (cl_command_queue)0) 
    { 
      clReleaseContext(context); 
      free(devices); 
      return -1; 
    } 

  for (i = 0; i < n; ++i)
    {
       poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcA[i], 4);
       poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcB[i], 4);
    }

 
  // allocate the buffer memory objects 
  memobjs[0] = clCreateBuffer(context, 
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                              sizeof(cl_float4) * n, srcA, NULL); 
  if (memobjs[0] == (cl_mem)0) 
    { 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  memobjs[1] = clCreateBuffer(context, 
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                              sizeof(cl_float4) * n, srcB, NULL); 
  if (memobjs[1] == (cl_mem)0) 
    { 
      delete_memobjs(memobjs, 1); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1;
    } 
 
  memobjs[2] = clCreateBuffer(context, 
			      CL_MEM_READ_WRITE, 
			      sizeof(cl_float) * n, NULL, NULL); 
  if (memobjs[2] == (cl_mem)0) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the program 
  program = 
    clCreateProgramWithBinary
    (context, 1, devices, &source_size, 
     (const unsigned char**)&program_source, NULL, NULL); 
  if (program == (cl_program)0) 
    { 
      delete_memobjs(memobjs, 3); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // build the program 
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 3); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the kernel 
  kernel = clCreateKernel(program, "dot_product", NULL); 
  if (kernel == (cl_kernel)0) 
    { 
      delete_memobjs(memobjs, 3); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // set the args values 
  err = clSetKernelArg(kernel,  0,  
		       sizeof(cl_mem), (void *) &memobjs[0]); 
  err |= clSetKernelArg(kernel, 1,  
			sizeof(cl_mem), (void *) &memobjs[1]); 
  err |= clSetKernelArg(kernel, 2,
			sizeof(cl_mem), (void *) &memobjs[2]); 
 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 3); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // set work-item dimensions 
  global_work_size[0] = n; 
  local_work_size[0]= 128; 
 
  // execute kernel 
  err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, 
			       global_work_size, local_work_size,  
			       0, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 3); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // read output image 
  err = clEnqueueReadBuffer(cmd_queue, memobjs[2], CL_TRUE, 
			    0, n * sizeof(cl_float), dst, 
			    0, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 3); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
  for (i = 0; i < n; ++i)
    {
      poclu_bswap_cl_float_array(devices[0], (cl_float*)&dst[i], 1);
      poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcA[i], 4);
      poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcB[i], 4);
    }
  free(devices); 


  // release kernel, program, and memory objects 
  delete_memobjs(memobjs, 3); 
  clReleaseKernel(kernel); 
  clReleaseProgram(program); 
  clReleaseCommandQueue(cmd_queue); 
  clReleaseContext(context); 
  return 0; // success... 
}
Esempio n. 20
0
int
main(int argc, char **argv)
{
    cl_uint num;
    cl_int err;
    int platform_idx = -1;
    cl_platform_id *plat_ids;
    int i;
    size_t sz;
    cl_device_id *gpu_devs;
    cl_context_properties cps[3];
    cl_context context;
    int opt;
    char *input;
    int run_size = 1024;
    struct AIISA_Program prog;
    cl_command_queue queue;
    int ei;
    int nloop = 16;
    struct AIISA_CodeBuffer buf;

    aiisa_code_buffer_init(&buf);

    clGetPlatformIDs(0, NULL, &num);

    plat_ids = (cl_platform_id*)malloc(sizeof(*plat_ids) * num);
    clGetPlatformIDs(num, plat_ids, NULL);

    while ((opt = getopt(argc, argv, "n:")) != -1) {
        switch (opt) {
        case 'n':
            run_size = atoi(optarg);
            break;

        default:
            puts("usage : run in.cl");
            return 1;
        }
    }

    if (optind >= argc) {
        puts("usage : run in.cl");
        return 1;
    }

    input = argv[optind];

    for (i=0; i<(int)num; i++) {
        char name[1024];
        size_t len;
        clGetPlatformInfo(plat_ids[i], CL_PLATFORM_VENDOR, sizeof(name), name, &len);

        //puts(name);
        if (strcmp(name, "Advanced Micro Devices, Inc.") == 0) {
            platform_idx = i;
            break;
        }
    }

    if (platform_idx == -1) {
        puts("no amd");
        return -1;
    }

    clGetDeviceIDs(plat_ids[platform_idx], CL_DEVICE_TYPE_GPU, 0, NULL, &num);
    if (num == 0) {
        puts("no gpu");
        return -1;
    }

    gpu_devs = (cl_device_id*)malloc(sizeof(gpu_devs[0]) * 1);
    //clGetDeviceIDs(plat_ids[platform_idx], CL_DEVICE_TYPE_GPU, num, gpu_devs, NULL);

    cps[0] = CL_CONTEXT_PLATFORM;
    cps[1] = (cl_context_properties)plat_ids[platform_idx];
    cps[2] = 0;

    context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
    clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(gpu_devs), gpu_devs, &sz);

    queue = clCreateCommandQueue(context, gpu_devs[0], 0, NULL);

    {
        char name[1024];
        size_t sz;
        clGetDeviceInfo(gpu_devs[0], CL_DEVICE_NAME, sizeof(name), name, &sz);

        puts(name);
    }

    //puts(input);

    aiisa_build_binary_from_cl(&prog, context, gpu_devs[0], input);

    for (ei=0; ei<nloop; ei++) {
        cl_program cl_prog;
        const unsigned char *bin[1];
        size_t bin_size[1];
        cl_kernel ker;
        cl_mem in, out;
        size_t global_size[3];
        double tb, te;

        tb = sec();
        gen_code(&prog, &buf);
        bin[0] = prog.cl_binary;
        bin_size[0] = prog.size;
        cl_prog = clCreateProgramWithBinary(context, 1, gpu_devs, bin_size, bin, NULL, NULL);
        clBuildProgram(cl_prog, 1, gpu_devs, NULL, NULL, NULL);
        ker = clCreateKernel(cl_prog, "f", &err);
        te = sec();
        printf("build : %f[usec]\n", (te-tb)*1000000);

        in = clCreateBuffer(context, CL_MEM_READ_WRITE, run_size * sizeof(int), NULL, &err);
        out = clCreateBuffer(context, CL_MEM_READ_WRITE, run_size * sizeof(int), NULL, &err);

        clSetKernelArg(ker, 0, sizeof(cl_mem), &in);
        clSetKernelArg(ker, 1, sizeof(cl_mem), &out);


        {
            int *ptr = (int*)clEnqueueMapBuffer(queue, in, CL_TRUE, CL_MAP_WRITE, 0, run_size*sizeof(int), 0, NULL, NULL, NULL);
            int i;
            for (i=0; i<run_size; i++) {
                ptr[i] = i;
            }
            clEnqueueUnmapMemObject(queue, in, ptr, 0, NULL, NULL);
        }

        {
            int *ptr = (int*)clEnqueueMapBuffer(queue, out, CL_TRUE, CL_MAP_WRITE, 0, run_size*sizeof(int), 0, NULL, NULL, NULL);
            int i;
            for (i=0; i<run_size; i++) {
                ptr[i] = 0xdeadbeef;
            }
            clEnqueueUnmapMemObject(queue, out, ptr, 0, NULL, NULL);
        }

        err = clFinish(queue);

        global_size[0] = run_size;
        err = clEnqueueNDRangeKernel(queue, ker, 1, NULL, global_size, NULL, 0, NULL, NULL);
        if (err != CL_SUCCESS) {
            puts("enqueue nd");
        }
        err = clFinish(queue);
        if (err != CL_SUCCESS) {
            puts("fini");
        }

        if (ei == 0) {
            int *ptr = (int*)clEnqueueMapBuffer(queue, out, CL_TRUE, CL_MAP_READ, 0, run_size*sizeof(int), 0, NULL, NULL, NULL);
            int i;
            for (i=0; i<run_size; i++) {
                printf("%d : %x\n", i, ptr[i]);
            }
            clEnqueueUnmapMemObject(queue, in, ptr, 0, NULL, NULL);
        }

        err = clFinish(queue);

        clReleaseMemObject(in);
        clReleaseMemObject(out);
        clReleaseKernel(ker);
        clReleaseProgram(cl_prog);
    }

    return 0;
}
bool
initOpenCL(W2XConv *c, ComputeEnv *env, W2XConvProcessor *proc)
{
        int dev_id = proc->dev_id;
        env->num_cl_dev = 1;
        env->cl_dev_list = new OpenCLDev[1];
        const OpenCLDevListEntry *de = &dev_list[dev_id];
        cl_int err;
        cl_device_id dev = de->dev;
        cl_context_properties props[] =
                {CL_CONTEXT_PLATFORM, (cl_context_properties)(de->plt_id), 0};
        cl_context context = clCreateContext(props, 1, &dev, NULL, NULL, &err);
        if (err != CL_SUCCESS) {
                setCLError(c, dev_id, err);
                return false;
        }

        if (proc->sub_type == W2XCONV_PROC_OPENCL_INTEL_GPU) {
                env->pref_block_size = 256;
        }

        cl_command_queue queue;
        cl_kernel ker_filter, ker_filter_in1_out32, ker_filter_in128_out1;
        cl_kernel ker_filter_in3_out32, ker_filter_in128_out3;
        cl_program program = 0;

        const char *dev_name = proc->dev_name;
        bool bin_avaiable = false;

#if ((defined __linux) && !(defined __ANDROID__)) || _WIN32
#define GENERATE_BINARY
#endif


#ifdef GENERATE_BINARY
#ifdef __linux
        ssize_t path_len = 4;
        char *self_path = (char*)malloc(path_len+1);
        while (1) {
                ssize_t r = readlink("/proc/self/exe", self_path, path_len);
                if (r < path_len) {
                        self_path[r] = '\0';
                        break;
                }

                path_len *= 2;
                self_path = (char*)realloc(self_path, path_len+1);
        }

        struct stat self_st;
        stat(self_path, &self_st);
        self_path = dirname(self_path);
#else
        size_t path_len = 4;
        char *self_path = (char*)malloc(path_len+1);
	DWORD len;
        while (1) {
		len = GetModuleFileName(NULL, self_path, path_len);
		if (len > 0 && len != path_len) {
			break;
		}

                path_len *= 2;
                self_path = (char*)realloc(self_path, path_len+1);
        }
	WIN32_FIND_DATA self_st;
	HANDLE finder = FindFirstFile(self_path, &self_st);
	FindClose(finder);

	for (int si=len-1; si>=0; si--) {
		if (self_path[si] == '\\') {
			self_path[si] = '\0';
			break;
		}
	}
#endif

        std::string bin_path = std::string(self_path) + "/" + &dev_name[0] + ".bin";

        FILE *binfp = fopen(bin_path.c_str(), "rb");
        if (binfp) {
#if (defined __linux)
                struct stat bin_st;
                stat(bin_path.c_str(), &bin_st);

                bool old = false;
                if (bin_st.st_mtim.tv_sec < self_st.st_mtim.tv_sec) {
                        old = true;
                }

                if (bin_st.st_mtim.tv_sec == self_st.st_mtim.tv_sec) {
                        if (bin_st.st_mtim.tv_nsec < self_st.st_mtim.tv_nsec) {
                                old = true;
                        }
                }
		size_t bin_sz = bin_st.st_size;
#else
                WIN32_FIND_DATA bin_st;
		HANDLE finder = FindFirstFile(bin_path.c_str(), &bin_st);
		FindClose(finder);

		bool old = false;
		uint64_t self_time = (((uint64_t)self_st.ftLastWriteTime.dwHighDateTime)<<32) |
			((uint64_t)self_st.ftLastWriteTime.dwLowDateTime);
		uint64_t bin_time = (((uint64_t)bin_st.ftLastWriteTime.dwHighDateTime)<<32) |
			((uint64_t)bin_st.ftLastWriteTime.dwLowDateTime);

		if (bin_time < self_time) {
			old = true;
		}

		size_t bin_sz = bin_st.nFileSizeLow;
#endif

                if (!old) {
                        unsigned char *bin = (unsigned char*)malloc(bin_sz);

                        size_t rem = bin_sz;
                        unsigned char *p = bin;
                        while (rem) {
                                size_t rsz = fread(p, 1, rem, binfp);
                                if (rsz <= 0) {
                                        break;
                                }

                                rem -= rsz;
                                p += rsz;
                        }

                        if (rem == 0) {
                                cl_int err;
                                program = clCreateProgramWithBinary(context, 1, &dev, &bin_sz,
                                                                    (const unsigned char**)&bin, NULL, &err);

                                if (err == CL_SUCCESS) {
                                        bin_avaiable = true;
                                }
                        }

                        free(bin);
                }

                fclose(binfp);
        }
#endif

        if (! bin_avaiable) {
                const char *source[1] = {prog};
                size_t src_len[1] = {sizeof(prog)-1};

                program = clCreateProgramWithSource(context, 1, source, src_len, &err);
                if (err != CL_SUCCESS) {
                        clReleaseContext(context);
                        setCLError(c, dev_id, err);
                        return false;
                }

        }

#ifdef GENERATE_BINARY
        free(self_path);
#endif

        err = clBuildProgram(program, 1, &dev, "" , nullptr, nullptr);
        if (err != CL_SUCCESS) {
                size_t log_len;
                clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_len);

                std::vector<char> log(log_len+1);
                clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, log_len, &log[0], &log_len);
                log[log_len] = '\0';

                puts(&log[0]);

                clReleaseProgram(program);
                clReleaseContext(context);
                setCLError(c, dev_id, err);
                return false;
        }



#ifdef GENERATE_BINARY
        if (!bin_avaiable) {
                size_t binsz;
                size_t ret_len;
                clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(binsz), &binsz, &ret_len);

                char *buffer = new char [binsz];
                char *ptrs[1];
                ptrs[0] = buffer;

                clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(ptrs), ptrs, &ret_len);

                FILE *fp = fopen(bin_path.c_str(), "wb");

                size_t rem = binsz;
                char *p = buffer;

                while (rem) {
                        size_t wsz = fwrite(p, 1, rem, fp);
                        if (wsz <= 0) {
                                fclose(fp);
                                unlink(bin_path.c_str());
                                fp=NULL;
                                break;
                        }
                        rem -= wsz;
                        p += wsz;
                }

                if (fp) {
                        fclose(fp);
                }

                delete [] buffer;
        }
#endif



        ker_filter = clCreateKernel(program, "filter", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                setCLError(c, dev_id, err);
                return false;
        }

        ker_filter_in1_out32 = clCreateKernel(program, "filter_in1_out32", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                setCLError(c, dev_id, err);
                return false;
        }

        ker_filter_in3_out32 = clCreateKernel(program, "filter_in3_out32", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                clReleaseKernel(ker_filter_in1_out32);
                setCLError(c, dev_id, err);
                return false;
        }

        ker_filter_in128_out1 = clCreateKernel(program, "filter_in128_out1", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                clReleaseKernel(ker_filter_in1_out32);
                setCLError(c, dev_id, err);
                return false;
        }

        ker_filter_in128_out3 = clCreateKernel(program, "filter_in128_out3", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                clReleaseKernel(ker_filter_in1_out32);
                setCLError(c, dev_id, err);
                return false;
        }

        queue = clCreateCommandQueue(context, dev, 0, &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                clReleaseKernel(ker_filter_in1_out32);
                setCLError(c, dev_id, err);
                return false;
        }

        env->num_cl_dev = 1;
        env->cl_dev_list = new OpenCLDev[1];

        env->cl_dev_list[0].platform = de->plt_id;
        env->cl_dev_list[0].context = context;
        env->cl_dev_list[0].devid = dev;
        env->cl_dev_list[0].queue = queue;
        env->cl_dev_list[0].program = program;
        env->cl_dev_list[0].ker_filter = ker_filter;
        env->cl_dev_list[0].ker_filter_in1_out32 = ker_filter_in1_out32;
        env->cl_dev_list[0].ker_filter_in128_out1 = ker_filter_in128_out1;
        env->cl_dev_list[0].ker_filter_in3_out32 = ker_filter_in3_out32;
        env->cl_dev_list[0].ker_filter_in128_out3 = ker_filter_in128_out3;
        env->cl_dev_list[0].name = &dev_name[0];

        return true;
}
Esempio n. 22
0
static void create_program_from_bitcode(char* bitcode_path) {
  cl_int err;
  unsigned int i;
  
  // Instead of passing actual executable bits, we pass a path to the
  // already-compiled bitcode to clCreateProgramWithBinary.  Note that
  // you may load bitcode for multiple devices in one call by passing
  // multiple paths and multiple devices.  In the multiple-device case, 
  // the indices should match: if device 0 is a 32-bit GPU, then path 0 
  // should be bitcode for a GPU.  In the example below, we are loading
  // bitcode for one device only.
  
  size_t len = strlen(bitcode_path);
  program = clCreateProgramWithBinary(context, 1, &device, &len,
    (const unsigned char**)&bitcode_path, NULL, &err);
  check_status("clCreateProgramWithBinary", err);
  
  // The above tells OpenCL how to locate the intermediate bitcode, but we
  // still must build the program to produce executable bits for our
  // *specific* device.  This transforms gpu32 bitcode into actual executable
  // bits for an AMD or Intel compute device (for example).
  
  err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
  check_status("clBuildProgram", err);
  
  // And that's it -- we have a fully-compiled program created from the 
  // bitcode.  Let's ask OpenCL for the test kernel.
  
  kernel = clCreateKernel(program, "vecadd", &err);
  check_status("clCreateKernel", err);
  
  // And now, let's test the kernel with some dummy data.
  
  float *host_a = (float*)malloc(sizeof(float)*4*NELEMENTS);
  float *host_b = (float*)malloc(sizeof(float)*4*NELEMENTS);
  float *host_c = (float*)malloc(sizeof(float)*4*NELEMENTS);
  
  // We pack some host buffers with our data.
  
  for (i = 0; i < NELEMENTS; i++) {
    host_a[i*4+0] = host_b[i*4+0] = i;
    host_a[i*4+1] = host_b[i*4+1] = i;
    host_a[i*4+2] = host_b[i*4+2] = i;
    host_a[i*4+3] = host_b[i*4+3] = i;
  }
  
  // And create and load some CL memory buffers with that host data.
  
  cl_mem a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
    sizeof(cl_float4)*NELEMENTS, host_a, &err);
  
  cl_mem b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
    sizeof(cl_float4)*NELEMENTS, host_b, &err);
  
  // CL buffer 'c' is for output, so we don't prepopulate it with data.
  
  cl_mem c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
    sizeof(cl_float4)*NELEMENTS, NULL, &err);
  
  if (a == NULL || b == NULL || c == NULL) {
    fprintf(stderr, "Error: Unable to create OpenCL buffer memory objects.\n");
    exit(1);
  }
  
  // We set the CL buffers as arguments for the 'vecadd' kernel.
  
  int argc = 0;
  err |= clSetKernelArg(kernel, argc++, sizeof(cl_mem), &a);
  err |= clSetKernelArg(kernel, argc++, sizeof(cl_mem), &b);
  err |= clSetKernelArg(kernel, argc++, sizeof(cl_mem), &c);
  check_status("clSetKernelArg", err);
  
  // Launch the kernel over a single dimension, which is the same size
  // as the number of float4s.  We let OpenCL select the local dimensions
  // by passing 'NULL' as the 6th parameter.
  
  size_t global = NELEMENTS;
  err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, 
    NULL);
  check_status("clEnqueueNDRangeKernel", err);
  
  // Read back the results (blocking, so everything finishes), and then 
  // validate the results.
  
  clEnqueueReadBuffer(queue, c, CL_TRUE, 0, NELEMENTS*sizeof(cl_float4), host_c, 
    0, NULL, NULL);
  
  int success = 1;
  for (i = 0; i < NELEMENTS; i++) {
    if ( host_c[i*4+0] != i*2.0 || host_c[i*4+1] != i * 2.0 ||
         host_c[i*4+2] != i*2.0 || host_c[i*4+3] != i * 2.0 ) 
    {
      success = 0;
      fprintf(stderr, "Validation failed at index %d\n", i);
      fprintf(stderr, "Kernel FAILED!\n");
      break;
    }
  }
  
  if (success) {
    fprintf(stdout, "Validation successful.\n");
  }
}
Esempio n. 23
0
_clState *initCl(unsigned int gpu, char *name, size_t nameSize)
{
	_clState *clState = calloc(1, sizeof(_clState));
	bool patchbfi = false, prog_built = false;
	struct cgpu_info *cgpu = &gpus[gpu];
	cl_platform_id platform = NULL;
	char pbuff[256], vbuff[255];
	cl_platform_id* platforms;
	cl_uint preferred_vwidth;
	cl_device_id *devices;
	cl_uint numPlatforms;
	cl_uint numDevices;
	cl_int status;

	status = clGetPlatformIDs(0, NULL, &numPlatforms);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Platforms. (clGetPlatformsIDs)", status);
		return NULL;
	}

	platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id));
	status = clGetPlatformIDs(numPlatforms, platforms, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status);
		return NULL;
	}

	if (opt_platform_id >= (int)numPlatforms) {
		applog(LOG_ERR, "Specified platform that does not exist");
		return NULL;
	}

	status = clGetPlatformInfo(platforms[opt_platform_id], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status);
		return NULL;
	}
	platform = platforms[opt_platform_id];

	if (platform == NULL) {
		perror("NULL platform found!\n");
		return NULL;
	}

	applog(LOG_INFO, "CL Platform vendor: %s", pbuff);
	status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL);
	if (status == CL_SUCCESS)
		applog(LOG_INFO, "CL Platform name: %s", pbuff);
	status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(vbuff), vbuff, NULL);
	if (status == CL_SUCCESS)
		applog(LOG_INFO, "CL Platform version: %s", vbuff);

	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status);
		return NULL;
	}

	if (numDevices > 0 ) {
		devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id));

		/* Now, get the device list data */

		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Getting Device IDs (list)", status);
			return NULL;
		}

		applog(LOG_INFO, "List of devices:");

		unsigned int i;
		for (i = 0; i < numDevices; i++) {
			status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS) {
				applog(LOG_ERR, "Error %d: Getting Device Info", status);
				return NULL;
			}

			applog(LOG_INFO, "\t%i\t%s", i, pbuff);
		}

		if (gpu < numDevices) {
			status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS) {
				applog(LOG_ERR, "Error %d: Getting Device Info", status);
				return NULL;
			}

			applog(LOG_INFO, "Selected %i: %s", gpu, pbuff);
			strncpy(name, pbuff, nameSize);
		} else {
			applog(LOG_ERR, "Invalid GPU %i", gpu);
			return NULL;
		}

	} else return NULL;

	cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };

	clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status);
		return NULL;
	}

	/////////////////////////////////////////////////////////////////
	// Create an OpenCL command queue
	/////////////////////////////////////////////////////////////////
	clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu],
						     CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status);
	if (status != CL_SUCCESS) /* Try again without OOE enable */
		clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status);
		return NULL;
	}

	/* Check for BFI INT support. Hopefully people don't mix devices with
	 * and without it! */
	char * extensions = malloc(1024);
	const char * camo = "cl_amd_media_ops";
	char *find;

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS", status);
		return NULL;
	}
	find = strstr(extensions, camo);
	if (find)
		clState->hasBitAlign = true;
		
	/* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */
	char * devoclver = malloc(1024);
	const char * ocl10 = "OpenCL 1.0";

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_VERSION, 1024, (void *)devoclver, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_VERSION", status);
		return NULL;
	}
	find = strstr(devoclver, ocl10);
	if (!find)
		clState->hasOpenCL11plus = true;

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&preferred_vwidth, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", status);
		return NULL;
	}
	applog(LOG_DEBUG, "Preferred vector width reported %d", preferred_vwidth);

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE", status);
		return NULL;
	}
	applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size);

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status);
		return NULL;
	}
	applog(LOG_DEBUG, "Max mem alloc size is %u", cgpu->max_alloc);

	/* Create binary filename based on parameters passed to opencl
	 * compiler to ensure we only load a binary that matches what would
	 * have otherwise created. The filename is:
	 * name + kernelname +/- g(offset) + v + vectors + w + work_size + l + sizeof(long) + .bin
	 * For scrypt the filename is:
	 * name + kernelname + g + lg + lookup_gap + tc + thread_concurrency + w + work_size + l + sizeof(long) + .bin
	 */
	char binaryfilename[255];
	char filename[255];
	char numbuf[16];

	if (cgpu->kernel == KL_NONE) {
		if (opt_scrypt) {
			applog(LOG_INFO, "Selecting scrypt kernel");
			clState->chosen_kernel = KL_SCRYPT;
		} else if (!strstr(name, "Tahiti") &&
			/* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */
			(strstr(vbuff, "844.4") ||  // Linux 64 bit ATI 2.6 SDK
			 strstr(vbuff, "851.4") ||  // Windows 64 bit ""
			 strstr(vbuff, "831.4") ||
			 strstr(vbuff, "898.1") ||  // 12.2 driver SDK 
			 strstr(vbuff, "923.1") ||  // 12.4
			 strstr(vbuff, "938.2") ||  // SDK 2.7
			 strstr(vbuff, "1113.2"))) {// SDK 2.8
				applog(LOG_INFO, "Selecting diablo kernel");
				clState->chosen_kernel = KL_DIABLO;
		/* Detect all 7970s, older ATI and NVIDIA and use poclbm */
		} else if (strstr(name, "Tahiti") || !clState->hasBitAlign) {
			applog(LOG_INFO, "Selecting poclbm kernel");
			clState->chosen_kernel = KL_POCLBM;
		/* Use phatk for the rest R5xxx R6xxx */
		} else {
			applog(LOG_INFO, "Selecting phatk kernel");
			clState->chosen_kernel = KL_PHATK;
		}
		cgpu->kernel = clState->chosen_kernel;
	} else {
		clState->chosen_kernel = cgpu->kernel;
		if (clState->chosen_kernel == KL_PHATK &&
		    (strstr(vbuff, "844.4") || strstr(vbuff, "851.4") ||
		     strstr(vbuff, "831.4") || strstr(vbuff, "898.1") ||
		     strstr(vbuff, "923.1") || strstr(vbuff, "938.2") ||
		     strstr(vbuff, "1113.2"))) {
			applog(LOG_WARNING, "WARNING: You have selected the phatk kernel.");
			applog(LOG_WARNING, "You are running SDK 2.6+ which performs poorly with this kernel.");
			applog(LOG_WARNING, "Downgrade your SDK and delete any .bin files before starting again.");
			applog(LOG_WARNING, "Or allow cgminer to automatically choose a more suitable kernel.");
		}
	}

	/* For some reason 2 vectors is still better even if the card says
	 * otherwise, and many cards lie about their max so use 256 as max
	 * unless explicitly set on the command line. Tahiti prefers 1 */
	if (strstr(name, "Tahiti"))
		preferred_vwidth = 1;
	else if (preferred_vwidth > 2)
		preferred_vwidth = 2;

	switch (clState->chosen_kernel) {
		case KL_POCLBM:
			strcpy(filename, POCLBM_KERNNAME".cl");
			strcpy(binaryfilename, POCLBM_KERNNAME);
			break;
		case KL_PHATK:
			strcpy(filename, PHATK_KERNNAME".cl");
			strcpy(binaryfilename, PHATK_KERNNAME);
			break;
		case KL_DIAKGCN:
			strcpy(filename, DIAKGCN_KERNNAME".cl");
			strcpy(binaryfilename, DIAKGCN_KERNNAME);
			break;
		case KL_SCRYPT:
			strcpy(filename, SCRYPT_KERNNAME".cl");
			strcpy(binaryfilename, SCRYPT_KERNNAME);
			/* Scrypt only supports vector 1 */
			cgpu->vwidth = 1;
			break;
		case KL_NONE: /* Shouldn't happen */
		case KL_DIABLO:
			strcpy(filename, DIABLO_KERNNAME".cl");
			strcpy(binaryfilename, DIABLO_KERNNAME);
			break;
	}

	if (cgpu->vwidth)
		clState->vwidth = cgpu->vwidth;
	else {
		clState->vwidth = preferred_vwidth;
		cgpu->vwidth = preferred_vwidth;
	}

	if (((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) &&
		clState->vwidth == 1 && clState->hasOpenCL11plus) || opt_scrypt)
			clState->goffset = true;

	if (cgpu->work_size && cgpu->work_size <= clState->max_work_size)
		clState->wsize = cgpu->work_size;
	else if (strstr(name, "Tahiti"))
		clState->wsize = 64;
	else
		clState->wsize = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->vwidth;
	cgpu->work_size = clState->wsize;

#ifdef USE_SCRYPT
	if (opt_scrypt) {
		if (!cgpu->opt_lg) {
			applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu);
			cgpu->lookup_gap = 2;
		} else
			cgpu->lookup_gap = cgpu->opt_lg;

		if (!cgpu->opt_tc) {
			unsigned int sixtyfours;

			sixtyfours =  cgpu->max_alloc / 131072 / 64 - 1;
			cgpu->thread_concurrency = sixtyfours * 64;
			if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) {
				cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders;
				if (cgpu->thread_concurrency > cgpu->shaders * 5)
					cgpu->thread_concurrency = cgpu->shaders * 5;
			}
			applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu,  cgpu->thread_concurrency);
		} else
			cgpu->thread_concurrency = cgpu->opt_tc;
	}
#endif

	FILE *binaryfile;
	size_t *binary_sizes;
	char **binaries;
	int pl;
	char *source = file_contents(filename, &pl);
	size_t sourceSize[] = {(size_t)pl};
	cl_uint slot, cpnd;

	slot = cpnd = 0;

	if (!source)
		return NULL;

	binary_sizes = calloc(sizeof(size_t) * MAX_GPUDEVICES * 4, 1);
	if (unlikely(!binary_sizes)) {
		applog(LOG_ERR, "Unable to calloc binary_sizes");
		return NULL;
	}
	binaries = calloc(sizeof(char *) * MAX_GPUDEVICES * 4, 1);
	if (unlikely(!binaries)) {
		applog(LOG_ERR, "Unable to calloc binaries");
		return NULL;
	}

	strcat(binaryfilename, name);
	if (clState->goffset)
		strcat(binaryfilename, "g");
	if (opt_scrypt) {
#ifdef USE_SCRYPT
		sprintf(numbuf, "lg%utc%u", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency);
		strcat(binaryfilename, numbuf);
#endif
	} else {
		sprintf(numbuf, "v%d", clState->vwidth);
		strcat(binaryfilename, numbuf);
	}
	sprintf(numbuf, "w%d", (int)clState->wsize);
	strcat(binaryfilename, numbuf);
	sprintf(numbuf, "l%d", (int)sizeof(long));
	strcat(binaryfilename, numbuf);
	strcat(binaryfilename, ".bin");

	binaryfile = fopen(binaryfilename, "rb");
	if (!binaryfile) {
		applog(LOG_DEBUG, "No binary found, generating from source");
	} else {
		struct stat binary_stat;

		if (unlikely(stat(binaryfilename, &binary_stat))) {
			applog(LOG_DEBUG, "Unable to stat binary, generating from source");
			fclose(binaryfile);
			goto build;
		}
		if (!binary_stat.st_size)
			goto build;

		binary_sizes[slot] = binary_stat.st_size;
		binaries[slot] = (char *)calloc(binary_sizes[slot], 1);
		if (unlikely(!binaries[slot])) {
			applog(LOG_ERR, "Unable to calloc binaries");
			fclose(binaryfile);
			return NULL;
		}

		if (fread(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot]) {
			applog(LOG_ERR, "Unable to fread binaries");
			fclose(binaryfile);
			free(binaries[slot]);
			goto build;
		}

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
			fclose(binaryfile);
			free(binaries[slot]);
			goto build;
		}

		fclose(binaryfile);
		applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename);

		goto built;
	}

	/////////////////////////////////////////////////////////////////
	// Load CL file, build CL program object, create CL kernel object
	/////////////////////////////////////////////////////////////////

build:
	clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithSource)", status);
		return NULL;
	}

	/* create a cl program executable for all the devices specified */
	char *CompilerOptions = calloc(1, 256);

#ifdef USE_SCRYPT
	if (opt_scrypt)
		sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d",
			cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, (int)clState->wsize);
	else
#endif
	{
		sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d",
			(int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth);
	}
	applog(LOG_DEBUG, "Setting worksize to %d", clState->wsize);
	if (clState->vwidth > 1)
		applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth);

	if (clState->hasBitAlign) {
		strcat(CompilerOptions, " -D BITALIGN");
		applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN");
		if (strstr(name, "Cedar") ||
		    strstr(name, "Redwood") ||
		    strstr(name, "Juniper") ||
		    strstr(name, "Cypress" ) ||
		    strstr(name, "Hemlock" ) ||
		    strstr(name, "Caicos" ) ||
		    strstr(name, "Turks" ) ||
		    strstr(name, "Barts" ) ||
		    strstr(name, "Cayman" ) ||
		    strstr(name, "Antilles" ) ||
		    strstr(name, "Wrestler" ) ||
		    strstr(name, "Zacate" ) ||
		    strstr(name, "WinterPark" ))
			patchbfi = true;
	} else
		applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN");

	if (patchbfi) {
		strcat(CompilerOptions, " -D BFI_INT");
		applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT");
	} else
		applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch");

	if (clState->goffset)
		strcat(CompilerOptions, " -D GOFFSET");

	if (!clState->hasOpenCL11plus)
		strcat(CompilerOptions, " -D OCL1");

	applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions);
	status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL);
	free(CompilerOptions);

	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status);
		size_t logSize;
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

		char *log = malloc(logSize);
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
		applog(LOG_ERR, "%s", log);
		return NULL;
	}

	prog_built = true;

	status = clGetProgramInfo(clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)", status);
		return NULL;
	}

	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)", status);
		return NULL;
	}

	/* The actual compiled binary ends up in a RANDOM slot! Grr, so we have
	 * to iterate over all the binary slots and find where the real program
	 * is. What the heck is this!? */
	for (slot = 0; slot < cpnd; slot++)
		if (binary_sizes[slot])
			break;

	/* copy over all of the generated binaries. */
	applog(LOG_DEBUG, "Binary size for gpu %d found in binary slot %d: %d", gpu, slot, binary_sizes[slot]);
	if (!binary_sizes[slot]) {
		applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!");
		return NULL;
	}
	binaries[slot] = calloc(sizeof(char) * binary_sizes[slot], 1);
	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL );
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)", status);
		return NULL;
	}

	/* Patch the kernel if the hardware supports BFI_INT but it needs to
	 * be hacked in */
	if (patchbfi) {
		unsigned remaining = binary_sizes[slot];
		char *w = binaries[slot];
		unsigned int start, length;

		/* Find 2nd incidence of .text, and copy the program's
		* position and length at a fixed offset from that. Then go
		* back and find the 2nd incidence of \x7ELF (rewind by one
		* from ELF) and then patch the opcocdes */
		if (!advance(&w, &remaining, ".text"))
			goto build;
		w++; remaining--;
		if (!advance(&w, &remaining, ".text")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		memcpy(&start, w + 285, 4);
		memcpy(&length, w + 289, 4);
		w = binaries[slot]; remaining = binary_sizes[slot];
		if (!advance(&w, &remaining, "ELF"))
			goto build;
		w++; remaining--;
		if (!advance(&w, &remaining, "ELF")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		w--; remaining++;
		w += start; remaining -= start;
		applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching",
			w, remaining);
		patch_opcodes(w, length);

		status = clReleaseProgram(clState->program);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Releasing program. (clReleaseProgram)", status);
			return NULL;
		}

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
			return NULL;
		}

		/* Program needs to be rebuilt */
		prog_built = false;
	}

	free(source);

	/* Save the binary to be loaded next time */
	binaryfile = fopen(binaryfilename, "wb");
	if (!binaryfile) {
		/* Not a fatal problem, just means we build it again next time */
		applog(LOG_DEBUG, "Unable to create file %s", binaryfilename);
	} else {
		if (unlikely(fwrite(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot])) {
			applog(LOG_ERR, "Unable to fwrite to binaryfile");
			return NULL;
		}
		fclose(binaryfile);
	}
built:
	if (binaries[slot])
		free(binaries[slot]);
	free(binaries);
	free(binary_sizes);

	applog(LOG_INFO, "Initialising kernel %s with%s bitalign, %d vectors and worksize %d",
	       filename, clState->hasBitAlign ? "" : "out", clState->vwidth, clState->wsize);

	if (!prog_built) {
		/* create a cl program executable for all the devices specified */
		status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status);
			size_t logSize;
			status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

			char *log = malloc(logSize);
			status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
			applog(LOG_ERR, "%s", log);
			return NULL;
		}
	}

	/* get a kernel object handle for a kernel with the given name */
	clState->kernel = clCreateKernel(clState->program, "search", &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Creating Kernel from program. (clCreateKernel)", status);
		return NULL;
	}

#ifdef USE_SCRYPT
	if (opt_scrypt) {
		size_t ipt = (2048 / cgpu->lookup_gap + (2048 % cgpu->lookup_gap > 0));
		size_t bufsize = 128 * ipt * cgpu->thread_concurrency;

		/* Use the max alloc value which has been rounded to a power of
		 * 2 greater >= required amount earlier */
		if (bufsize > cgpu->max_alloc) {
			applog(LOG_WARNING, "Maximum buffer memory device %d supports says %u", gpu, cgpu->max_alloc);
			applog(LOG_WARNING, "Your scrypt settings come to %u", bufsize);
		}
		applog(LOG_DEBUG, "Creating scrypt buffer sized %u", bufsize);
		clState->padbufsize = bufsize;

		/* This buffer is weird and might work to some degree even if
		 * the create buffer call has apparently failed, so check if we
		 * get anything back before we call it a failure. */
		clState->padbuffer8 = NULL;
		clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status);
		if (status != CL_SUCCESS && !clState->padbuffer8) {
			applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease CT or increase LG", status);
			return NULL;
		}

		clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status);
			return NULL;
		}
	}
#endif
	clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status);
		return NULL;
	}

	return clState;
}
int
BinomialOption::setupCL()
{
    cl_int status = CL_SUCCESS;
    size_t deviceListSize;
    
    cl_device_type dType;
    
    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */

    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clGetPlatformIDs failed."))
    {
        return SDK_FAILURE;
    }
    if (0 < numPlatforms) 
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clGetPlatformIDs failed."))
        {
            return SDK_FAILURE;
        }
        if(isPlatformEnabled())
        {
            platform = platforms[platformId];
        }
        else
        {
            for (unsigned i = 0; i < numPlatforms; ++i) 
            {
                char pbuf[100];
                status = clGetPlatformInfo(platforms[i],
                                           CL_PLATFORM_VENDOR,
                                           sizeof(pbuf),
                                           pbuf,
                                           NULL);

                if(!sampleCommon->checkVal(status,
                                           CL_SUCCESS,
                                           "clGetPlatformInfo failed."))
                {
                    return SDK_FAILURE;
                }

                platform = platforms[i];
                if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) 
                {
                    break;
                }
            }
        }
        delete[] platforms;
    }

    if(NULL == platform)
    {
        sampleCommon->error("NULL platform found so Exiting Application.");
        return SDK_FAILURE;
    }

    // Display available devices.
    if(!sampleCommon->displayDevices(platform, dType))
    {
        sampleCommon->error("sampleCommon::displayDevices() failed");
        return SDK_FAILURE;
    }

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(cps,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateContextFromType failed."))
    {
        return SDK_FAILURE;
    }

    /* First, get the size of device list data */
    status = clGetContextInfo(context, 
                              CL_CONTEXT_DEVICES, 
                              0, 
                              NULL, 
                              &deviceListSize);
    if(!sampleCommon->checkVal(status, 
                               CL_SUCCESS,
                               "clGetContextInfo failed."))
    {
        return SDK_FAILURE;
    }

    int deviceCount = (int)(deviceListSize / sizeof(cl_device_id));
    if(!sampleCommon->validateDeviceId(deviceId, deviceCount))
    {
        sampleCommon->error("sampleCommon::validateDeviceId() failed");
        return SDK_FAILURE;
    }

    /* Now allocate memory for device list based on the size we got earlier */
    devices = (cl_device_id *)malloc(deviceListSize);
    if(devices == NULL) 
    {
		sampleCommon->error("Failed to allocate memory (devices).");
		return SDK_FAILURE;
	}

    /* Now, get the device list data */
    status = clGetContextInfo(context, 
                              CL_CONTEXT_DEVICES,
                              deviceListSize,
                              devices,
                              NULL);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetContextInfo failed."))
    {
        return SDK_FAILURE;
    }

    /* Create command queue */
    commandQueue = clCreateCommandQueue(context,
                                        devices[deviceId],
                                        0,
                                        &status);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateCommandQueue failed."))
    {
        return SDK_FAILURE;
    }

    /* Get Device specific Information */
    status = clGetDeviceInfo(devices[deviceId],
                             CL_DEVICE_MAX_WORK_GROUP_SIZE,
                             sizeof(size_t),
                             (void*)&maxWorkGroupSize,
                             NULL);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetDeviceInfo"
                               "CL_DEVICE_MAX_WORK_GROUP_SIZE failed."))
        return SDK_FAILURE;


    status = clGetDeviceInfo(devices[deviceId],
                             CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                             sizeof(cl_uint),
                             (void*)&maxDimensions,
                             NULL);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetDeviceInfo"
                               "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed."))
    {
        return SDK_FAILURE;
    }


    maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t));

    status = clGetDeviceInfo(devices[deviceId],
                             CL_DEVICE_MAX_WORK_ITEM_SIZES,
                             sizeof(size_t) * maxDimensions,
                             (void*)maxWorkItemSizes,
                             NULL);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetDeviceInfo"
                               "CL_DEVICE_MAX_WORK_ITEM_SIZES failed."))
    {
        return SDK_FAILURE;
    }


    status = clGetDeviceInfo(devices[deviceId],
                             CL_DEVICE_LOCAL_MEM_SIZE,
                             sizeof(cl_ulong),
                             (void*)&totalLocalMemory,
                             NULL);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetDeviceInfo"
                               "CL_DEVICE_LOCAL_MEM_SIZE failed."))
    {
        return SDK_FAILURE;
    }

    /**
     * Create and initialize memory objects
     */

    /* Create memory object for stock price */
    randBuffer = clCreateBuffer(context,
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                numSamples * sizeof(cl_float4),
                                randArray,
                                &status);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateBuffer failed. (randBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* Create memory object for output array */
    outBuffer = clCreateBuffer(context,
                               CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                               numSamples * sizeof(cl_float4),
                               output,
                               &status);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateBuffer failed. (outBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* create a CL program using the kernel source */
    streamsdk::SDKFile kernelFile;
    std::string kernelPath = sampleCommon->getPath();

    if(isLoadBinaryEnabled())
    {
        kernelPath.append(loadBinary.c_str());
        if(!kernelFile.readBinaryFromFile(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }

        const char * binary = kernelFile.source().c_str();
        size_t binarySize = kernelFile.source().size();
        program = clCreateProgramWithBinary(context,
                                            1,
                                            &devices[deviceId], 
                                            (const size_t*)&binarySize,
                                            (const unsigned char**)&binary,
                                            NULL,
                                            &status);
    }
    else
    {
	// special case for packetized OpenCL (can not yet compile .cl directly)
	char vName[100];
	status = clGetPlatformInfo(platform,
			CL_PLATFORM_VENDOR,
			sizeof(vName),
			vName,
			NULL);
	const bool platformIsPacketizedOpenCL = !strcmp(vName, "Ralf Karrenberg, Saarland University");
	if (!strcmp(vName, "Intel(R) Corporation")) {
		vendorName = "intel";
	} else if (!strcmp(vName, "Advanced Micro Devices, Inc.")) {
		vendorName = "amd";
	} else if (platformIsPacketizedOpenCL) {
		vendorName = "pkt";
	} else {
		printf("ERROR: vendor not recognized: %s\n", vName);
	}

	kernelPath.append("BinomialOption_Kernels.cl");
	if(!kernelFile.open(kernelPath.c_str()))
	{
		std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
		return SDK_FAILURE;
	}

	const char * source = kernelFile.source().c_str();

        size_t sourceSize[] = {strlen(source)};
        program = clCreateProgramWithSource(context,
                                            1,
                                            &source,
                                            sourceSize,
                                            &status);
    }
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateProgramWithSource failed."))
    {
        return SDK_FAILURE;
    }

    std::string flagsStr = std::string("");

    // Get additional options
    if(isComplierFlagsSpecified())
    {
        streamsdk::SDKFile flagsFile;
        std::string flagsPath = sampleCommon->getPath();
        flagsPath.append(flags.c_str());
        if(!flagsFile.open(flagsPath.c_str()))
        {
            std::cout << "Failed to load flags file: " << flagsPath << std::endl;
            return SDK_FAILURE;
        }
        flagsFile.replaceNewlineWithSpaces();
        const char * flags = flagsFile.source().c_str();
        flagsStr.append(flags);
    }

    if(flagsStr.size() != 0)
        std::cout << "Build Options are : " << flagsStr.c_str() << std::endl;

    

    /* create a cl program executable for all the devices specified */
    status = clBuildProgram(program, 
							1, 
							&devices[deviceId], 
							flagsStr.c_str(), 
							NULL, 
							NULL);
    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            cl_int logStatus;
            char * buildLog = NULL;
            size_t buildLogSize = 0;
            logStatus = clGetProgramBuildInfo(program,
                                              devices[deviceId],
                                              CL_PROGRAM_BUILD_LOG,
                                              buildLogSize,
                                              buildLog,
                                              &buildLogSize);
            if(!sampleCommon->checkVal(logStatus,
                                       CL_SUCCESS,
                                       "clGetProgramBuildInfo failed."))
            {
                  return SDK_FAILURE;
            }
            
            buildLog = (char*)malloc(buildLogSize);
            if(buildLog == NULL)
            {
                sampleCommon->error("Failed to allocate host memory. (buildLog)");
                return SDK_FAILURE;
            }
            memset(buildLog, 0, buildLogSize);

            logStatus = clGetProgramBuildInfo(program, 
                                              devices[deviceId], 
                                              CL_PROGRAM_BUILD_LOG, 
                                              buildLogSize, 
                                              buildLog, 
                                              NULL);
            if(!sampleCommon->checkVal(logStatus,
                                       CL_SUCCESS,
                                       "clGetProgramBuildInfo failed."))
            {
                  free(buildLog);
                  return SDK_FAILURE;
            }

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << buildLog << std::endl;
            std::cout << " ************************************************\n";
            free(buildLog);
        }

          if(!sampleCommon->checkVal(status,
                                     CL_SUCCESS,
                                     "clBuildProgram failed."))
          {
                return SDK_FAILURE;
          }
    }
    /* get a kernel object handle for a kernel with the given name */
    kernel = clCreateKernel(program,
                            "binomial_options",
                            &status);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateKernel failed."))
    {
        return SDK_FAILURE;
    }

    /* Get kernel work group size */
    status = clGetKernelWorkGroupInfo(kernel,
                                      devices[deviceId],
                                      CL_KERNEL_WORK_GROUP_SIZE,
                                      sizeof(size_t),
                                      &kernelWorkGroupSize,
                                      0);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetKernelWorkGroupInfo failed."))
    {
        return SDK_FAILURE;
    }

    /* If group-size is gerater than maximum supported on kernel */
    if((size_t)(numSteps + 1) > kernelWorkGroupSize)
    {
        if(!quiet)
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << (numSteps + 1) << std::endl;
            std::cout << "Max Group Size supported on the kernel : " 
                      << kernelWorkGroupSize << std::endl;
            std::cout << "Using appropiate group-size." << std::endl;
            std::cout << "-------------------------------------------" << std::endl;
        }
        numSteps = (cl_int)kernelWorkGroupSize - 2;
    }

    return SDK_SUCCESS;
}
int
DeviceFission::setupCLRuntime()
{
    cl_int status = CL_SUCCESS;

    // Create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("DeviceFission_Kernels.cl");
    buildData.devices = Devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

	// Get allocate memory for subCmdQueue
	subCmdQueue = (cl_command_queue*)malloc(numSubDevices * sizeof(cl_command_queue));
	CHECK_ALLOCATION(subCmdQueue,"Failed to allocate memory. (subCmdQueue)");

	// Create command queue subCmdQueue
    for(cl_uint i = 0; i < numSubDevices; i++)
    {
        // Create command queue 
        subCmdQueue[i] = clCreateCommandQueue(rContext,
											  subDevices[i],
											  0,
											  &status);
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed. (subCmdQueue)");
    }

	// Create command queue gpuCmdQueue
	gpuCmdQueue = clCreateCommandQueue(rContext,
									   gpuDevice,
									   0,
									   &status);
	CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed. (gpuCmdQueue)");
  
	// Create memory objects for input
    InBuf = clCreateBuffer(rContext,
                           CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                           length * sizeof(cl_int),
                           NULL,
                           &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (InBuf)");

	// Get allocate memory for sub devices output 
	subOutBuf = (cl_mem*)malloc(numSubDevices * sizeof(cl_mem));

	for(cl_uint i = 0; i < numSubDevices; i++)
	{
		// Create memory objects for sub devices output 
		subOutBuf[i] = clCreateBuffer(rContext,
								      CL_MEM_WRITE_ONLY,
								      half_length * sizeof(cl_int) ,
								      NULL,
								      &status);
		CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (subOutBuf)");
	}

	// Get allocate memory for GPU device output 
	gpuOutBuf = (cl_mem*)malloc(numSubDevices * sizeof(cl_mem));

	for(cl_uint i = 0; i < numSubDevices; i++)
	{
		// Create memory objects for GPU device output 
		gpuOutBuf[i] = clCreateBuffer(rContext,
									  CL_MEM_WRITE_ONLY,
									  half_length * sizeof(cl_int) ,
									  NULL,
									  &status);
		CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (gpuOutBuf)");
	}

    streamsdk::SDKFile kernelFile;
    std::string kernelPath = sampleCommon->getPath();

    char * source = NULL;
    size_t sourceSize[] = {0};
    char * binary = NULL;
    size_t binarySize = 0;

    if(isLoadBinaryEnabled())
    {
		kernelPath += loadBinary;

		if(kernelFile.readBinaryFromFile(kernelPath.c_str()))
		{
			std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
			return SDK_FAILURE;
		}

        // Get binaries and binary sizes for CPU devices
        char** subBinaries = (char**)malloc(numSubDevices * sizeof(char*));
        if(subBinaries == NULL)
        {
            sampleCommon->error("Failed to allocate memory(subBinaries)");
            return SDK_FAILURE;
        }

        size_t* subBinariesSize = (size_t*)malloc(numSubDevices * sizeof(size_t*));
        if(subBinariesSize == NULL)
        {
            sampleCommon->error("Failed to allocate memory(subBinariesSize)");
            return SDK_FAILURE;
        }

        for(cl_uint i = 0; i < numSubDevices; ++i)
        {
            subBinaries[i] = (char*)kernelFile.source().c_str();
            subBinariesSize[i] = kernelFile.source().size();
        }

        subProgram = clCreateProgramWithBinary(rContext,
                                               numSubDevices,
                                               subDevices, 
                                               (const size_t *)subBinariesSize,
                                               (const unsigned char**)subBinaries,
                                               NULL,
                                               &status);
		CHECK_OPENCL_ERROR(status, "clCreateProgramWithBinary failed.(subProgram)");

		streamsdk::SDKFile kernelFileGPU;
		std::string kernelPathGPU = sampleCommon->getPath();
		if(!gpuAvailable)
		{
			loadBinaryGPU = loadBinary;
		}
		kernelPathGPU += loadBinaryGPU;

		if(loadBinaryGPU.length() == 0)
		{
			std::cout << "Failed to load GPU kernel file, please assign it by '--loadgpu'. "<< std::endl;
			return SDK_FAILURE;
		}

		if(kernelFileGPU.readBinaryFromFile(kernelPathGPU.c_str()))
		{
			std::cout << "Failed to load GPU kernel file : " << kernelPathGPU << std::endl;
			return SDK_FAILURE;
		}

		// Get binaries and binary sizes for GPU device
		char* subBinariesGPU;
		size_t subBinariesSizeGPU;;

		subBinariesGPU = (char*)kernelFileGPU.source().c_str();
		subBinariesSizeGPU = kernelFileGPU.source().size();

		gpuProgram = clCreateProgramWithBinary(rContext,
											   1,
											   &gpuDevice, 
											   &subBinariesSizeGPU,
											   (const unsigned char **)&subBinariesGPU,
											   NULL,
											   &status);
		CHECK_OPENCL_ERROR(status, "clCreateProgramWithBinary failed.(gpuProgram)");

        free(subBinaries);
        free(subBinariesSize);
        subBinariesSize = NULL;
        subBinaries = NULL;
    }
    else
    {
        kernelPath.append("DeviceFission_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))//bool
        {
            std::cout << "Failed to load kernel file: " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        const char * source = kernelFile.source().c_str();
        size_t sourceSize[] = {strlen(source)};

        // Create a CL program for sub-devices using the kernel source
        subProgram = clCreateProgramWithSource(rContext,
                                               1,
                                               (const char**)&source,
                                               sourceSize,
                                               &status);
        CHECK_OPENCL_ERROR(status, "clCreateProgramWithSource failed.(subProgram)");

		 // Create a CL program for GPU device using the kernel source
		gpuProgram = clCreateProgramWithSource(rContext,
											   1,
											   (const char**)&source,
											   sourceSize,
											   &status);
		CHECK_OPENCL_ERROR(status, "clCreateProgramWithSource failed.(gpuProgram)");
    }

	// Get build options
	const char *flags;
	streamsdk::SDKFile flagsFile;
	std::string flagsPath = sampleCommon->getPath();
	if(buildData.flagsFileName.size() != 0)
	{
		flagsPath.append(buildData.flagsFileName.c_str());
		if(!flagsFile.open(flagsPath.c_str()))
		{
			std::cout << "Failed to load flags file: " << flagsPath << std::endl;
			return SDK_FAILURE;
		}
		flagsFile.replaceNewlineWithSpaces();
		flags = flagsFile.source().c_str();
		if(strlen(flags) != 0)
			std::cout << "Build Options are : " << flags << std::endl;
	}
	else
	{
		flags = NULL;
	}
	
    // Create a cl program executable for all sub-devices 
    status = clBuildProgram(subProgram,
                            numSubDevices,
                            subDevices,
                            flags,
                            NULL,
                            NULL);
	CHECK_OPENCL_ERROR(status, "clBuildProgram failed.(subProgram)");
    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            cl_int logStatus;
            char * buildLog = NULL;
            size_t buildLogSize = 0;
            logStatus = clGetProgramBuildInfo(subProgram, 
                                              subDevices[0], 
                                              CL_PROGRAM_BUILD_LOG, 
                                              buildLogSize, 
                                              buildLog, 
                                              &buildLogSize);
            if(!sampleCommon->checkVal(logStatus,
                                       CL_SUCCESS,
                                       "clGetProgramBuildInfo failed."))
                return SDK_FAILURE;

            buildLog = (char*)malloc(buildLogSize);
            if(NULL == buildLog)
            {
                sampleCommon->error("Failed to allocate host memory.(buildLog)");
                return SDK_FAILURE;
            }
            memset(buildLog, 0, buildLogSize);

            logStatus = clGetProgramBuildInfo(subProgram, 
                                              subDevices[0], 
                                              CL_PROGRAM_BUILD_LOG, 
                                              buildLogSize, 
                                              buildLog, 
                                              NULL);
            if(!sampleCommon->checkVal(logStatus,
                                       CL_SUCCESS,
                                       "clGetProgramBuildInfo failed."))
            {
				free(buildLog);
				return SDK_FAILURE;
            }

            std::cout << " \n\t\t\tBUILD LOG(SUB-DEVICES)\n";
            std::cout << " ************************************************\n";
            std::cout << buildLog << std::endl;
            std::cout << " ************************************************\n";
            free(buildLog);
        }

        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clBuildProgram failed. (SUB-DEVICES)"))
            return SDK_FAILURE;
	}


	// Create a cl program executable for GPU device
	status = clBuildProgram(gpuProgram,
							1,
							&gpuDevice,
							flags,
							NULL,
							NULL);
	CHECK_OPENCL_ERROR(status, "clBuildProgram failed.(gpuProgram)");
	if(status != CL_SUCCESS)
	{
		if(status == CL_BUILD_PROGRAM_FAILURE)
		{
			cl_int logStatus;
			char * buildLog = NULL;
			size_t buildLogSize = 0;
			logStatus = clGetProgramBuildInfo(gpuProgram, 
											  gpuDevice, 
											  CL_PROGRAM_BUILD_LOG, 
											  buildLogSize, 
											  buildLog, 
											  &buildLogSize);
			if(!sampleCommon->checkVal(logStatus,
									   CL_SUCCESS,
									   "clGetProgramBuildInfo failed."))
				return SDK_FAILURE;

			buildLog = (char*)malloc(buildLogSize);
			if(NULL == buildLog)
			{
				sampleCommon->error("Failed to allocate host memory.(buildLog)");
				return SDK_FAILURE;
			}
			memset(buildLog, 0, buildLogSize);

			logStatus = clGetProgramBuildInfo(gpuProgram, 
											  gpuDevice, 
											  CL_PROGRAM_BUILD_LOG, 
											  buildLogSize, 
											  buildLog, 
											  NULL);
			if(!sampleCommon->checkVal(logStatus,
									   CL_SUCCESS,
									   "clGetProgramBuildInfo failed."))
			{
				free(buildLog);
				return SDK_FAILURE;
			}

			std::cout << " \n\t\t\tBUILD LOG(GPU-DEVICE)\n";
			std::cout << " ************************************************\n";
			std::cout << buildLog << std::endl;
			std::cout << " ************************************************\n";
			free(buildLog);
		}

		if(!sampleCommon->checkVal(status,
								   CL_SUCCESS,
								   "clBuildProgram failed. (GPU-DEVICE)"))
			return SDK_FAILURE;
    }

    // Get a kernel object handle for a kernel with the given name 
    subKernel[0] = clCreateKernel(subProgram,
                                  "Add",
                                  &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(subKernel[0])");

	// Get a kernel object handle for a kernel with the given name 
	subKernel[1] = clCreateKernel(subProgram,
								  "Sub",
								  &status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(subKernel[1])");

	// Get a kernel object handle for a kernel with the given name 
	gpuKernel[0] = clCreateKernel(gpuProgram,
								  "Add",
								  &status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(gpuKernel[0])");

	// Get a kernel object handle for a kernel with the given name 
	gpuKernel[1] = clCreateKernel(gpuProgram,
								  "Sub",
								  &status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(gpuKernel[1])");

    return SDK_SUCCESS;
}
Esempio n. 26
0
void init_platform() {

  cl_uint num_platforms;
  cl_uint num_devices;

  // Get the platform ID
  status = clGetPlatformIDs(1, &platform, &num_platforms);
  if(status != CL_SUCCESS) {
    printf("Failed clGetPlatformIDs. %d", status);
    freeResources();
    exit (1);
  }
  if(num_platforms != 1) {
    printf("Found %d platforms!\n", num_platforms);
    freeResources();
    exit (1);
  }

  // Get the device ID
  status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &num_devices);
  if(status != CL_SUCCESS) {
    printf("Failed clGetDeviceIDs. %d", status);
    freeResources();
    exit (1);
  }
  if(num_devices != 1) {
    printf("Found %d devices!\n", num_devices);
    freeResources();
    exit (1);
  }

  // Create a context
  context = clCreateContext(0, 1, &device, NULL, NULL, &status);
  if(status != CL_SUCCESS) {
    printf("Failed clCreateContext. %d", status);
    freeResources();
    exit (1);
  }
  queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
  if(status != CL_SUCCESS) {
    printf("Failed to create queue. Error %d", status);
    freeResources();
    exit (1);
  }
  
  // Create the program using binary already compiled offline using aoc (i.e. the .aocx file)
  FILE* fp = fopen(AOCX_FILE, "rb");
  if (fp == NULL) {
    printf("Failed to open %s file (fopen).\n", AOCX_FILE);
    exit(1);
  }
  fseek(fp, 0, SEEK_END);
  size_t binary_length = ftell(fp);
  unsigned char*binary = (unsigned char*) malloc(sizeof(unsigned char) * binary_length);
  assert(binary && "Malloc failed");
  rewind(fp);
  if (fread((void*)binary, binary_length, 1, fp) == 0) {
    printf("Failed to read from moving_average.aocx file (fread).\n");
    exit (1);
  }
  fclose(fp);
  cl_int kernel_status;
  program = clCreateProgramWithBinary(context, 1, &device, &binary_length, (const unsigned char**)&binary, &kernel_status, &status);
  if(status != CL_SUCCESS || kernel_status != CL_SUCCESS) {
    printf("Failed clCreateProgramWithBinary. %d", status);
    freeResources();
    exit (1);
  }

  // Build the program
  status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
  if(status != CL_SUCCESS) {
    printf("Failed clBuildProgram. %d", status);
    freeResources();
    exit (1);
  }
}
Esempio n. 27
0
void TexDecoder_OpenCL_Initialize()
{
	if(!g_Inited)
	{
		if(!OpenCL::Initialize())
			return;

		cl_int err = 1;
		size_t binary_size = 0;
		char *binary = NULL;
		char *header = NULL;
		size_t nDevices = 0;
		cl_device_id *devices = NULL;
		size_t *binary_sizes = NULL;
		char **binaries = NULL;
		std::string filename;
		char dolphin_rev[HEADER_SIZE];

		filename = File::GetUserPath(D_OPENCL_IDX) + "kernel.bin";
		snprintf(dolphin_rev, HEADER_SIZE, "%-31s", scm_rev_str);

		{
		File::IOFile input(filename, "rb");
		if (!input)
		{
			binary_size = 0;
		}
		else
		{
			binary_size = input.GetSize();
			header = new char[HEADER_SIZE];
			binary = new char[binary_size];
			input.ReadBytes(header, HEADER_SIZE);
			input.ReadBytes(binary, binary_size);
		}
		}

		if (binary_size > 0)
		{
			if (binary_size > HEADER_SIZE)
			{
				if (strncmp(header, dolphin_rev, HEADER_SIZE) == 0)
				{
					g_program = clCreateProgramWithBinary(OpenCL::GetContext(), 1, &OpenCL::device_id, &binary_size, (const unsigned char**)&binary, NULL, &err);
					if (err != CL_SUCCESS)
					{
						OpenCL::HandleCLError(err, "clCreateProgramWithBinary");
					}

					if (!err)
					{
						err = clBuildProgram(g_program, 1, &OpenCL::device_id, NULL, NULL, NULL);
						if (err != CL_SUCCESS)
						{
							OpenCL::HandleCLError(err, "clBuildProgram");
						}
					}
				}
			}
			delete [] header;
			delete [] binary;
		}

		// If an error occurred using the kernel binary, recompile the kernels
		if (err)
		{
			std::string code;
			filename = File::GetSysDirectory() + OPENCL_DIR DIR_SEP "TextureDecoder.cl";
			if (!File::ReadFileToString(filename.c_str(), code))
			{
				ERROR_LOG(VIDEO, "Failed to load OpenCL code %s - file is missing?", filename.c_str());
				return;
			}

			g_program = OpenCL::CompileProgram(code.c_str());

			err = clGetProgramInfo(g_program, CL_PROGRAM_NUM_DEVICES, sizeof(nDevices), &nDevices, NULL);
			if (err != CL_SUCCESS)
			{
				OpenCL::HandleCLError(err, "clGetProgramInfo");
			}
			devices = (cl_device_id *)malloc( sizeof(cl_device_id) *nDevices);

			err = clGetProgramInfo(g_program, CL_PROGRAM_DEVICES, sizeof(cl_device_id)*nDevices, devices, NULL);
			if (err != CL_SUCCESS)
			{
				OpenCL::HandleCLError(err, "clGetProgramInfo");
			}

			binary_sizes = (size_t *)malloc(sizeof(size_t)*nDevices);
			err = clGetProgramInfo(g_program, CL_PROGRAM_BINARY_SIZES,	sizeof(size_t)*nDevices, binary_sizes, NULL);
			if (err != CL_SUCCESS)
			{
				OpenCL::HandleCLError(err, "clGetProgramInfo");
			}

			binaries = (char **)malloc(sizeof(char *)*nDevices);
			for (u32 i = 0; i < nDevices; ++i)
			{
				if (binary_sizes[i] != 0)
				{
					binaries[i] = (char *)malloc(HEADER_SIZE + binary_sizes[i]);
				}
				else
				{
					binaries[i] = NULL;
				}
			}
			err = clGetProgramInfo( g_program, CL_PROGRAM_BINARIES,	sizeof(char *)*nDevices, binaries, NULL );
			if (err != CL_SUCCESS)
			{
				OpenCL::HandleCLError(err, "clGetProgramInfo");
			}

			if (!err)
			{
				filename = File::GetUserPath(D_OPENCL_IDX) + "kernel.bin";

				File::IOFile output(filename, "wb");
				if (!output)
				{
					binary_size = 0;
				}
				else
				{
					// Supporting one OpenCL device for now
					output.WriteBytes(dolphin_rev, HEADER_SIZE);
					output.WriteBytes(binaries[0], binary_sizes[0]);
				}
			}
			for (u32 i = 0; i < nDevices; ++i)
			{
				if (binary_sizes[i] != 0)
				{
					free(binaries[i]);
				}
			}
			if (binaries != NULL)
				free(binaries);
			if (binary_sizes != NULL)
				free(binary_sizes);
			if (devices != NULL)
				free(devices);
		}

		for (int i = 0; i <= GX_TF_CMPR; ++i)
		{
			if (g_DecodeParametersNative[i].name)
				g_DecodeParametersNative[i].kernel =
				OpenCL::CompileKernel(g_program,
				g_DecodeParametersNative[i].name);

			if (g_DecodeParametersRGBA[i].name)
				g_DecodeParametersRGBA[i].kernel =
				OpenCL::CompileKernel(g_program,
				g_DecodeParametersRGBA[i].name);
		}

		// Allocating maximal Wii texture size in advance, so that we don't have to allocate/deallocate per texture
#ifndef DEBUG_OPENCL
		g_clsrc = clCreateBuffer(OpenCL::GetContext(), CL_MEM_READ_ONLY , 1024 * 1024 * sizeof(u32), NULL, NULL);
		g_cldst = clCreateBuffer(OpenCL::GetContext(), CL_MEM_WRITE_ONLY, 1024 * 1024 * sizeof(u32), NULL, NULL);
#endif

		g_Inited = true;
	}
}
Esempio n. 28
0
cl_kernel
CLContext::generate_kernel_id (
    CLKernel *kernel,
    const uint8_t *source, size_t length,
    CLContext::KernelBuildType type)
{
    struct CLProgram {
        cl_program id;

        CLProgram ()
            : id (NULL)
        {}
        ~CLProgram () {
            if (id)
                clReleaseProgram (id);
        }
    };

    CLProgram program;
    cl_kernel kernel_id = NULL;
    cl_int error_code = CL_SUCCESS;
    cl_device_id device_id = _device->get_device_id ();
    const char * name = kernel->get_kernel_name ();

    XCAM_ASSERT (source && length);
    XCAM_ASSERT (name);

    switch (type) {
    case KERNEL_BUILD_SOURCE:
        program.id =
            clCreateProgramWithSource (
                _context_id, 1,
                (const char**)(&source), (const size_t *)&length,
                &error_code);
        break;
    case KERNEL_BUILD_BINARY:
        program.id =
            clCreateProgramWithBinary (
                _context_id, 1, &device_id,
                (const size_t *)&length, (const uint8_t**)(&source),
                NULL, &error_code);
        break;
    }

    XCAM_FAIL_RETURN (
        WARNING,
        error_code == CL_SUCCESS,
        NULL,
        "cl create program failed with error_cod:%d", error_code);
    XCAM_ASSERT (program.id);

    error_code = clBuildProgram (program.id, 1, &device_id, NULL, CLContext::program_pfn_notify, this);
    if (error_code != CL_SUCCESS) {
        char error_log [XCAM_CL_MAX_STR_SIZE];
        xcam_mem_clear (error_log);
        clGetProgramBuildInfo (program.id, device_id, CL_PROGRAM_BUILD_LOG, sizeof (error_log) - 1, error_log, NULL);
        XCAM_LOG_WARNING ("CL build program failed on %s, build log:%s", name, error_log);
        return NULL;
    }

    kernel_id = clCreateKernel (program.id, name, &error_code);
    XCAM_FAIL_RETURN (
        WARNING,
        error_code == CL_SUCCESS,
        NULL,
        "cl create kernel(%s) failed with error_cod:%d", name, error_code);

    return kernel_id;
}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program binary */
        unsigned char *bin;
        size_t bin_len;
        cl_int bin_ret;
        
        /* Read program binary */
        if (argc == 2)
                bin = read_buffer((char *)argv[1], &bin_len);
        else
        {
                printf("error: No binary specified\n");
                exit(1);
        }
        
        /* Create a program */
        cl_program program;
        program = clCreateProgramWithBinary(context, 1, &device, &bin_len, (const unsigned char **)&bin, &bin_ret, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithBinary' failed\n");
                exit(1);
        }
        if (bin_ret != CL_SUCCESS)
        {
                printf("error: Invalid binary for device\n");
                exit(1);
        }
        printf("program=%p\n", program);
        
        /* Free binary */
        free(bin);
        
        printf("program binary loaded\n");
        printf("\n");

        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object*/
        cl_kernel kernel;
        kernel = clCreateKernel(program, "subtract_floatfloat", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_float *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_float));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_float)(2.0);
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create and init host side src buffer 1 */
        cl_float *src_1_host_buffer;
        src_1_host_buffer = malloc(num_elem * sizeof(cl_float));
        for (int i = 0; i < num_elem; i++)
                src_1_host_buffer[i] = (cl_float)(2.0);
        
        /* Create and init device side src buffer 1 */
        cl_mem src_1_device_buffer;
        src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float), src_1_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_float *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_float));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_float));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_float), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer);
        ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_float), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_float));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Free host side src buffer 1 */
        free(src_1_host_buffer);

        /* Free device side src buffer 1 */
        ret = clReleaseMemObject(src_1_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
Esempio n. 30
0
int main()
{
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem memobj = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;

    float mem[MEM_SIZE];

    FILE *fp;
    char fileName[] = "./kernel.clbin";
    size_t binary_size;
    char *binary_buf;
    cl_int binary_status;
    cl_int i;

    /* カーネルを含むオブジェクトファイルをロード */
    fp = fopen(fileName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    binary_buf = (char *)malloc(MAX_BINARY_SIZE);
    binary_size = fread( binary_buf, 1, MAX_BINARY_SIZE, fp );
    fclose( fp );

    /* データを初期化 */
    for( i = 0; i < MEM_SIZE; i++ ) {
        mem[i] = i;
    }

    /* プラットフォーム・デバイスの情報の取得 */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

    /* OpenCLコンテキストの作成 */
    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
  
    /* コマンドキューの作成 */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
  
    /* メモリバッファの作成 */
    memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(float), NULL, &ret);

    /* メモリバッファにデータを転送 */
    ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);

    /* 読み込んだバイナリからカーネルプログラムを作成 */
    program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *)&binary_size, 
                                        (const unsigned char **)&binary_buf, &binary_status, &ret);
    
    /* OpenCLカーネルの作成 */
    kernel = clCreateKernel(program, "vecAdd", &ret);
    printf("err:%d\n", ret);

    /* OpenCLカーネル引数の設定 */
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);

    size_t global_work_size[3] = {MEM_SIZE, 0, 0};
    size_t local_work_size[3]  = {MEM_SIZE, 0, 0};

    /* OpenCLカーネルを実行 */
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);

    /* メモリバッファから結果を取得 */
    ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);

    /* 結果の表示 */
    for(i=0; i<MEM_SIZE; i++) {
        printf("mem[%d] : %f\n", i, mem[i]);
    }
  
    /* 終了処理 */
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(memobj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    free(binary_buf);

    return 0;
}