int main(int argc, char *argv[]){
    cl_uint numPlatforms;
    cl_platform_id* clSelectedPlatformID = NULL;  
	int err;                            // error code returned from api calls
      
    int data[DATA_SIZE];              // original data set given to device
    int results[DATA_SIZE];           // results returned from device
    unsigned int correct;               // number of correct results returned
    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation
 
    cl_device_id device_id;              
    cl_context context;                 
    cl_command_queue commands;          
    cl_program program;                 
    cl_kernel kernel;                   
    
    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array
    
    if(parseArgs(argc, argv)){
        return 0;
    }
    // Fill our data set with random int values
    unsigned int count = DATA_SIZE;
    

	////////////////////////////////////////////////////////////////////////////////
	 
	// Simple compute kernel which computes the collatz of an input array 
	//
	const char *KernelSource = fileToString("gpuFunctions.c");
    
	//get Platform
	clGetPlatformIDs(0, NULL, &numPlatforms);
	clSelectedPlatformID = (cl_platform_id*)malloc(sizeof(cl_platform_id)*numPlatforms);
    err = clGetPlatformIDs(numPlatforms, clSelectedPlatformID, NULL);

    //get Device
    err = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
    	printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }

    //create context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

     // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }
 
    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }

    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
 
        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }
 
    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "allToOne", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }


    // Create the input and output arrays in device memory for our calculation
    //
    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }
    timer t = createTimer();
    for(int i =0;i<rep;i++){
        initData(data);
        // Write our data set into the input array in device memory 
        //
        err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to write to source array!\n");
            exit(1);
        }
     
        // Set the arguments to our compute kernel
        //
        err = 0;
        err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
        err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to set kernel arguments! %d\n", err);
            exit(1);
        }
     
        // Get the maximum work group size for executing the kernel on the device
        //
        err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to retrieve kernel work group info! %d\n", err);
            exit(1);
        }
     
        // Execute the kernel over the entire range of our 1d input data set
        // using the maximum number of work group items for this device
        //
        
        global = count;
        err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
        if (err)
        {
            printf("Error: Failed to execute kernel!\n");
            return EXIT_FAILURE;
        }
     
        // Wait for the command commands to get serviced before reading back results
        //
        clFinish(commands);
     
        // Read back the results from the device to verify the output
        //
        err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );  
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to read output array! %d\n", err);
            exit(1);
        }
    }
    double timeEnd = getTime(t);
    
    // Validate our results
    //
    correct = 0;
    for(int i = 0; i < arraySize; i++)
    {
        if(results[i] >= 0){
            correct++;
            if(i==0){
               printf("%d",results[i]);
            }else{
               printf(",%d",results[i]);
            }
        }
    }
    printf("\n");
    

    // Print a brief summary detailing the results
    printf("Computed '%d/%d' values to 1!\n", correct, arraySize);
    printf("TIME- %f\n",timeEnd);
    
    // Shutdown and cleanup
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
	return 0;
}
Example #2
0
ErrorStatus gemm_clblas(cl_device_id device, const void *inMatrixA, int nrowA, int ncolA, bool transposeA,
                        const void *inMatrixB, int nrowB, int ncolB, bool transposeB,
                        double alpha, double beta, void *outMatrix, bool use_float)
{
    std::stringstream result;
    
    float *input_matrixA_f = (float *)inMatrixA;
    float *input_matrixB_f = (float *)inMatrixB;
    
    float *output_matrix_f = (float *)outMatrix;
    
    double *input_matrixA_d = (double *)inMatrixA;
    double *input_matrixB_d = (double *)inMatrixB;
    
    double *output_matrix_d = (double *)outMatrix;
    
    if (debug) {
        result << "gemm_clblas( " << (use_float ? "FLOAT" : "DOUBLE") <<
        ")" << std::endl << std::endl;
    }
    
    cl_int err = CL_SUCCESS;
    
    clblasStatus status = clblasSetup();
    if (status != CL_SUCCESS) {
        if (debug) {
            result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
        }
        
        err = CL_INVALID_OPERATION;
    }
    
    // get first platform
    cl_platform_id platform = NULL;
    if (err == CL_SUCCESS) {
        err = clGetPlatformIDs(1, &platform, NULL);
    }
    
    if (debug && err == CL_SUCCESS) {
        result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
        result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
    }
    
    // context
    cl_context context = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateContext:" << std::endl;
        }
        
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    }
    
    // queue
    cl_command_queue queue = NULL;
    if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
        if (debug) {
            result << "clCreateCommandQueueWithProperties:" << std::endl;
        }
        
        queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
        
#else
        if (debug) {
            result << "clCreateCommandQueue:" << std::endl;
        }
        
        queue = clCreateCommandQueue(context, device, 0, &err);
#endif
    }
    
    // buffers
    cl_mem cl_input_matrixA = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixA:" << std::endl;
        }
        
        if (use_float) {
            cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowA * ncolA * sizeof(float), input_matrixA_f, &err);
            
        } else {
            cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowA * ncolA * sizeof(double), input_matrixA_d, &err);
        }
    }
    
    cl_mem cl_input_matrixB = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixB:" << std::endl;
        }
        
        if (use_float) {
            cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowB * ncolB * sizeof(float), input_matrixB_f, &err);
            
        } else {
            cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowB * ncolB * sizeof(double), input_matrixB_d, &err);
        }
    }
    
    int nrowC = transposeA ? ncolA : nrowA;
    int ncolC = transposeB ? nrowB : ncolB;
    cl_mem cl_output_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_output_vector:" << std::endl;
        }
        
        if (use_float) {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              nrowC * ncolC * sizeof(float), output_matrix_f, &err);
            
        } else {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              nrowC * ncolC * sizeof(double), output_matrix_d, &err);
        }
        
    }
    
    // ++++++++++++
    const int lda = nrowA;  // first dimension of A (rows), before any transpose
    const int ldb = nrowB;  // first dimension of B (rows), before any transpose
    const int ldc = nrowC;      // first dimension of C (rows)
    
    const int M = transposeA ? ncolA : nrowA;    // rows in A (after transpose, if any) and C
    const int N = transposeB ? nrowB : ncolB;    // cols in B (after transpose, if any) and C
    const int K = transposeA ? nrowA : ncolA;    // cols in A and rows in B (after transposes, if any)
    
    const clblasOrder order = clblasColumnMajor;
    const clblasTranspose transA = transposeA ? clblasTrans : clblasNoTrans;
    const clblasTranspose transB = transposeB ? clblasTrans : clblasNoTrans;
    
    cl_event event = NULL;
    
    if (err == CL_SUCCESS) {
        if (use_float) {
            if (debug) {
                result << "clblasSgemm:" << std::endl;
            }
            
            status = clblasSgemm(order, transA, transB, M, N, K,
                              alpha, cl_input_matrixA, 0, lda,
                              cl_input_matrixB, 0, ldb, beta,
                              cl_output_matrix, 0, ldc,
                              1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS && debug) {
                result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl;
            }
            
        } else {
            if (debug) {
                result << "clblasDgemm:" << std::endl;
            }
            
            status = clblasDgemm(order, transA, transB, M, N, K,
                                 alpha, cl_input_matrixA, 0, lda,
                                 cl_input_matrixB, 0, ldb, beta,
                                 cl_output_matrix, 0, ldc,
                                 1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS) {
                if (debug) {
                    result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl;
                }
                
                err = status;
            }
        }
    }
    
    if (err == CL_SUCCESS) {
        /* Wait for calculations to be finished. */
        if (debug) {
            result << "clWaitForEvents:" << std::endl;
        }
        err = clWaitForEvents(1, &event);
    }
    
    // retrieve result
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "Retrieve result:" << std::endl;
        }
        
        if (use_float) {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(float), output_matrix_f, 0, NULL, NULL);
            
        } else {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(double), output_matrix_d, 0, NULL, NULL);
        }
    }
    
    std::string err_str = clErrorToString(err);
    result << std::endl << err_str << std::endl;
    
    // cleanup
    clReleaseMemObject(cl_output_matrix);
    cl_output_matrix = NULL;
    
    clReleaseMemObject(cl_input_matrixA);
    cl_input_matrixA = NULL;
    
    clReleaseMemObject(cl_input_matrixB);
    cl_input_matrixB = NULL;
    
    clReleaseCommandQueue(queue);
    queue = NULL;
    
    clReleaseContext(context);
    context = NULL;
    
    if (debug) {
        CERR << result.str();
    }
    
    ErrorStatus errorStatus = { err, status };
    
    return errorStatus;
}
Example #3
0
int main()
{
  typedef float       ScalarType;
  
  
  /////////////////////////////////////////////////////////////////////////////////////////////////////////
  //////////////////////// Part 1: Set up a custom context and perform a sample operation. ////////////////
  ////////////////////////         This is rather lengthy due to the OpenCL framework.     ////////////////
  ////////////////////////         The following does essentially the same as the          ////////////////
  ////////////////////////         'custom_kernels'-tutorial!                               ////////////////
  /////////////////////////////////////////////////////////////////////////////////////////////////////////
  
  //manually set up a custom OpenCL context:
  std::vector<cl_device_id> device_id_array;
  
  //get all available devices
  viennacl::ocl::platform pf;
  std::cout << "Platform info: " << pf.info() << std::endl;
  std::vector<viennacl::ocl::device> devices = pf.devices(CL_DEVICE_TYPE_DEFAULT);
  std::cout << devices[0].name() << std::endl;
  std::cout << "Number of devices for custom context: " << devices.size() << std::endl;
  
  //set up context using all found devices:
  for (size_t i=0; i<devices.size(); ++i)
  {
      device_id_array.push_back(devices[i].id());
  }
     
  std::cout << "Creating context..." << std::endl;
  cl_int err;
  cl_context my_context = clCreateContext(0, device_id_array.size(), &(device_id_array[0]), NULL, NULL, &err);
  VIENNACL_ERR_CHECK(err);
   
  
  //create two Vectors:
  unsigned int vector_size = 10;
  std::vector<ScalarType> vec1(vector_size);
  std::vector<ScalarType> vec2(vector_size);
  std::vector<ScalarType> result(vector_size);
  
  //
  // fill the operands vec1 and vec2:
  //
  for (unsigned int i=0; i<vector_size; ++i)
  {
    vec1[i] = static_cast<ScalarType>(i);
    vec2[i] = static_cast<ScalarType>(vector_size-i);
  }
  
  //
  // create memory in OpenCL context:
  //
  cl_mem mem_vec1 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec1[0]), &err);
  VIENNACL_ERR_CHECK(err);
  cl_mem mem_vec2 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec2[0]), &err);
  VIENNACL_ERR_CHECK(err);
  cl_mem mem_result = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(result[0]), &err);
  VIENNACL_ERR_CHECK(err);

  // 
  // create a command queue for each device:
  // 
  
  std::vector<cl_command_queue> queues(devices.size());
  for (size_t i=0; i<devices.size(); ++i)
  {
    queues[i] = clCreateCommandQueue(my_context, devices[i].id(), 0, &err);
    VIENNACL_ERR_CHECK(err);
  }
  
  // 
  // create and build a program in the context:
  // 
  size_t source_len = std::string(my_compute_program).length();
  cl_program my_prog = clCreateProgramWithSource(my_context, 1, &my_compute_program, &source_len, &err);
  err = clBuildProgram(my_prog, 0, NULL, NULL, NULL, NULL);
  
/*            char buffer[1024];
            cl_build_status status;
            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_LOG, sizeof(char)*1024, &buffer, NULL);
            std::cout << "Build Scalar: Err = " << err << " Status = " << status << std::endl;
            std::cout << "Log: " << buffer << std::endl;*/
  
  VIENNACL_ERR_CHECK(err);
  
  // 
  // create a kernel from the program:
  // 
  const char * kernel_name = "elementwise_prod";
  cl_kernel my_kernel = clCreateKernel(my_prog, kernel_name, &err);
  VIENNACL_ERR_CHECK(err);

  
  //
  // Execute elementwise_prod kernel on first queue: result = vec1 .* vec2;
  //
  err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), (void*)&mem_vec1);
  VIENNACL_ERR_CHECK(err);
  err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), (void*)&mem_vec2);
  VIENNACL_ERR_CHECK(err);
  err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), (void*)&mem_result);
  VIENNACL_ERR_CHECK(err);
  err = clSetKernelArg(my_kernel, 3, sizeof(unsigned int), (void*)&vector_size);
  VIENNACL_ERR_CHECK(err);
  size_t global_size = vector_size;
  size_t local_size = vector_size;
  err = clEnqueueNDRangeKernel(queues[0], my_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
  VIENNACL_ERR_CHECK(err);
  
  
  //
  // Read and output result:
  //
  err = clEnqueueReadBuffer(queues[0], mem_vec1, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(vec1[0]), 0, NULL, NULL);
  VIENNACL_ERR_CHECK(err);
  err = clEnqueueReadBuffer(queues[0], mem_result, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(result[0]), 0, NULL, NULL);
  VIENNACL_ERR_CHECK(err);

  std::cout << "vec1  : ";
  for (size_t i=0; i<vec1.size(); ++i)
    std::cout << vec1[i] << " ";
  std::cout << std::endl;

  std::cout << "vec2  : ";
  for (size_t i=0; i<vec2.size(); ++i)
    std::cout << vec2[i] << " ";
  std::cout << std::endl;

  std::cout << "result: ";
  for (size_t i=0; i<result.size(); ++i)
    std::cout << result[i] << " ";
  std::cout << std::endl;
  
  ////////////////////////////////////////////////////////////////////////////////////////////////////////
  /////////////////////// Part 2: Let ViennaCL use the already created context: //////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////////////

  //Tell ViennaCL to use the previously created context.
  //This context is assigned an id '0' when using viennacl::ocl::switch_context().
  viennacl::ocl::setup_context(0, my_context, device_id_array, queues);
  viennacl::ocl::switch_context(0); //activate the new context (only mandatory with context-id not equal to zero)
  
  //
  // Proof that ViennaCL really uses the new context:
  //
  std::cout << "Existing context: " << my_context << std::endl;
  std::cout << "ViennaCL uses context: " << viennacl::ocl::current_context().handle().get() << std::endl;

  //
  // Wrap existing OpenCL objects into ViennaCL:
  //
  viennacl::vector<ScalarType> vcl_vec1(mem_vec1, vector_size);
  viennacl::vector<ScalarType> vcl_vec2(mem_vec2, vector_size);
  viennacl::vector<ScalarType> vcl_result(mem_result, vector_size);
  viennacl::scalar<ScalarType> vcl_s = 2.0;

  std::cout << "Standard vector operations within ViennaCL:" << std::endl;
  vcl_result = vcl_s * vcl_vec1 + vcl_vec2;
  
  std::cout << "vec1  : ";
  std::cout << vcl_vec1 << std::endl;

  std::cout << "vec2  : ";
  std::cout << vcl_vec2 << std::endl;

  std::cout << "result: ";
  std::cout << vcl_result << std::endl;
  
  //
  // We can also reuse the existing elementwise_prod kernel. 
  // Therefore, we first have to make the existing program known to ViennaCL
  // For more details on the three lines, see tutorial 'custom-kernels'
  //
  std::cout << "Using existing kernel within the OpenCL backend of ViennaCL:" << std::endl;
  viennacl::ocl::program & my_vcl_prog = viennacl::ocl::current_context().add_program(my_prog, "my_compute_program");
  viennacl::ocl::kernel & my_vcl_kernel = my_vcl_prog.add_kernel("elementwise_prod");
  viennacl::ocl::enqueue(my_vcl_kernel(vcl_vec1, vcl_vec2, vcl_result, static_cast<cl_uint>(vcl_vec1.size())));  //Note that size_t might differ between host and device. Thus, a cast to cl_uint is necessary here.
  
  std::cout << "vec1  : ";
  std::cout << vcl_vec1 << std::endl;

  std::cout << "vec2  : ";
  std::cout << vcl_vec2 << std::endl;

  std::cout << "result: ";
  std::cout << vcl_result << std::endl;
  
  
  //
  // Since a linear piece of memory can be interpreted in several ways, 
  // we will now create a 3x3 row-major matrix out of the linear memory in mem_vec1/
  // The first three entries in vcl_vec2 and vcl_result are used to carry out matrix-vector products:
  //
  viennacl::matrix<ScalarType> vcl_matrix(mem_vec1, 3, 3);
  
  vcl_vec2.resize(3);   //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied)
  vcl_result.resize(3); //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied)
  vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_vec2);

  std::cout << "result of matrix-vector product: ";
  std::cout << vcl_result << std::endl;

  //
  //  That's it.
  //
  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
  
  return 0;
}
Example #4
0
/*
 * pgstrom_collect_device_info
 *
 * It collects properties of all the OpenCL devices. It shall be called once
 * by the OpenCL management worker process, prior to any other backends.
 */
static List *
construct_opencl_device_info(int platform_index)
{
	cl_platform_id	platforms[32];
	cl_device_id	devices[MAX_NUM_DEVICES];
	cl_uint			n_platform;
	cl_uint			n_devices;
	cl_int			i, j, rc;
	long			score_max = -1;
	List		   *result = NIL;

	rc = clGetPlatformIDs(lengthof(platforms),
						  platforms,
						  &n_platform);
	if (rc != CL_SUCCESS)
		elog(ERROR, "clGetPlatformIDs failed (%s)", opencl_strerror(rc));

	for (i=0; i < n_platform; i++)
	{
		pgstrom_platform_info  *pl_info;
		pgstrom_device_info	   *dev_info;
		long		score = 0;
		List	   *temp = NIL;

		pl_info = collect_opencl_platform_info(platforms[i]);
		pl_info->pl_index = i;

		rc = clGetDeviceIDs(platforms[i],
							CL_DEVICE_TYPE_CPU |
							CL_DEVICE_TYPE_GPU |
							CL_DEVICE_TYPE_ACCELERATOR,
							lengthof(devices),
							devices,
							&n_devices);
		if (rc != CL_SUCCESS)
			elog(ERROR, "clGetDeviceIDs failed (%s)", opencl_strerror(rc));

		elog(LOG, "PG-Strom: [%d] OpenCL Platform: %s", i, pl_info->pl_name);

		for (j=0; j < n_devices; j++)
		{
			dev_info = collect_opencl_device_info(devices[j]);
			dev_info->pl_info = pl_info;
			dev_info->dev_index = j;

			elog(LOG, "PG-Strom:  + device %s (%uMHz x %uunits, %luMB)",
				 dev_info->dev_name,
				 dev_info->dev_max_clock_frequency,
				 dev_info->dev_max_compute_units,
				 dev_info->dev_global_mem_size >> 20);

			/* rough estimation about computing power */
			if ((dev_info->dev_type & CL_DEVICE_TYPE_GPU) != 0)
				score += 32 * (dev_info->dev_max_compute_units *
							   dev_info->dev_max_clock_frequency);
			else
				score += (dev_info->dev_max_compute_units *
						  dev_info->dev_max_clock_frequency);

			temp = lappend(temp, dev_info);
		}

		if (platform_index == i || (platform_index < 0 && score > score_max))
		{
			opencl_platform_id = platforms[i];
			opencl_num_devices = n_devices;
			for (j=0; j < n_devices; j++)
				opencl_devices[j] = devices[j];

			score_max = score;
			result = temp;
		}
	}

	/* show platform name if auto-selection */
	if (platform_index < 0 && result != NIL)
	{
		pgstrom_platform_info *pl_info
			= ((pgstrom_device_info *) linitial(result))->pl_info;
		elog(LOG, "PG-Strom: auto platform selection: %s", pl_info->pl_name);
	}

	if (result != NIL)
	{
		/*
		 * Create an OpenCL context
		 */
		opencl_context = clCreateContext(NULL,
										 opencl_num_devices,
										 opencl_devices,
										 NULL,
										 NULL,
										 &rc);
		if (rc != CL_SUCCESS)
			elog(ERROR, "clCreateContext failed: %s", opencl_strerror(rc));

		/*
		 * Create an OpenCL command queue for each device
		 */
		for (j=0; j < opencl_num_devices; j++)
		{
			opencl_cmdq[j] =
				clCreateCommandQueue(opencl_context,
									 opencl_devices[j],
									 CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
									 CL_QUEUE_PROFILING_ENABLE,
									 &rc);
			if (rc != CL_SUCCESS)
				elog(ERROR, "clCreateCommandQueue failed: %s",
					 opencl_strerror(rc));
		}
	}
	return result;
}
Example #5
0
xcl_world xcl_world_single(cl_device_type device_type, char *target_vendor, char *target_device) {
	int err;
	xcl_world world;
	cl_uint num_platforms;

	err = clGetPlatformIDs(0, NULL, &num_platforms);
	if (err != CL_SUCCESS) {
		printf("Error: no platforms available or OpenCL install broken");
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

	cl_platform_id *platform_ids = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms);

	if (platform_ids == NULL) {
		printf("Error: Out of Memory\n");
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

	err = clGetPlatformIDs(num_platforms, platform_ids, NULL);
	if (err != CL_SUCCESS) {
		printf("Error: Failed to find an OpenCL platform!\n");
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

	int i;
        char cl_platform_vendor[1001];
        //find target vendor if target_vendor is specified
        if (target_vendor != NULL) {
                for(i = 0; i < num_platforms; i++) {
                        err = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_VENDOR, 1000, (void *)cl_platform_vendor,NULL);
                        if (err != CL_SUCCESS) {
                                printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
                                printf("Test failed\n");
                                exit(EXIT_FAILURE);
                        }
                        if ((target_vendor != NULL) && (strcmp(cl_platform_vendor, target_vendor) == 0)) {
                                printf("INFO: Selected platform %d from %s\n", i, cl_platform_vendor);
                                world.platform_id = platform_ids[i];
                                break;
                        }
                }
        } else {
                for(i = 0; i < num_platforms; i++) {
                        err = clGetDeviceIDs(platform_ids[i], device_type,
                                             1, &world.device_id, NULL);
                        if (err == CL_SUCCESS) {
                                world.platform_id = platform_ids[i];
                                break;
                        }
                }            
        }
	free(platform_ids);
	if (i == num_platforms) {
		printf("Error: Failed to find a platform\n");
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

        if (target_device != NULL) {
                //find target device
                cl_device_id devices[16];  // compute device id 
                cl_uint num_devices;
                char cl_device_name[100];
                err = clGetDeviceIDs(world.platform_id, CL_DEVICE_TYPE_ACCELERATOR,
                                     16, devices, &num_devices);
                if (err != CL_SUCCESS) {
                        printf("Error: Failed to create a device group!\n");
                        printf("Test failed\n");
                        exit(EXIT_FAILURE);
                }

                //iterate all devices to select the target device. 
                for (i=0; i<num_devices; i++) {
                        err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 100, cl_device_name, 0);
                        if (err != CL_SUCCESS) {
                                printf("Error: Failed to get device name for device %d!\n", i);
                                printf("Test failed\n");
                                exit(EXIT_FAILURE);
                        }
                        //printf("CL_DEVICE_NAME %s\n", cl_device_name);
                        if (strcmp(cl_device_name, target_device) == 0) {
                                world.device_id = devices[i];
                                printf("INFO: Selected %s as the target device\n", cl_device_name);
                                break;
                        }
                }

                if (i == num_devices) {
                        printf("Error: Failed to find target device %s\n", target_device);
                        printf("Test failed\n");
                        exit(EXIT_FAILURE);
                }
        }

	world.context = clCreateContext(0, 1, &world.device_id,
	                                NULL, NULL, &err);
	if (err != CL_SUCCESS) {
		printf("Error: Failed to create a compute context!\n");
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

	world.command_queue = clCreateCommandQueue(world.context,
	                                           world.device_id,
	                                           CL_QUEUE_PROFILING_ENABLE,
	                                           &err);
	if (err != CL_SUCCESS) {
		printf("Error: Failed to create a command queue!\n");
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

	return world;
}
Example #6
0
int main() {

   // Set the image rotation (in degrees)
   float theta = 3.14159/6;
   float cos_theta = cosf(theta);
   float sin_theta = sinf(theta);
   printf("theta = %f (cos theta = %f, sin theta = %f)\n", theta, cos_theta, 
      sin_theta);

   // Rows and columns in the input image
   int imageHeight;
   int imageWidth;

   const char* inputFile = "input.bmp";
   const char* outputFile = "output.bmp";

   // Homegrown function to read a BMP from file
   float* inputImage = readImage(inputFile, &imageWidth,
      &imageHeight);

   // Size of the input and output images on the host
   int dataSize = imageHeight*imageWidth*sizeof(float);

   // Output image on the host
   float* outputImage = NULL;
   outputImage = (float*)malloc(dataSize);

   // Set up the OpenCL environment
   cl_int status;

   // Discovery platform
   cl_platform_id platforms[2];
   cl_platform_id platform;
   status = clGetPlatformIDs(2, platforms, NULL);
   chk(status, "clGetPlatformIDs");
   platform = platforms[PLATFORM_TO_USE];

   // Discover device
   cl_device_id device;
   clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
   chk(status, "clGetDeviceIDs");

   // Create context
   cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
       (cl_context_properties)(platform), 0};
   cl_context context;
   context = clCreateContext(props, 1, &device, NULL, NULL, &status);
   chk(status, "clCreateContext");

   // Create command queue
   cl_command_queue queue;
   queue = clCreateCommandQueue(context, device, 0, &status);
   chk(status, "clCreateCommandQueue");

   // Create the input and output buffers
   cl_mem d_input;
   d_input = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL,
       &status);
   chk(status, "clCreateBuffer");

   cl_mem d_output;
   d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL,
       &status);
   chk(status, "clCreateBuffer");

   // Copy the input image to the device
   status = clEnqueueWriteBuffer(queue, d_input, CL_TRUE, 0, dataSize, 
         inputImage, 0, NULL, NULL);
   chk(status, "clEnqueueWriteBuffer");

   const char* source = readSource("rotation.cl");

   // Create a program object with source and build it
   cl_program program;
   program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
   chk(status, "clCreateProgramWithSource");
   status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
   chk(status, "clBuildProgram");
   
   // Create the kernel object
   cl_kernel kernel;
   kernel = clCreateKernel(program, "img_rotate", &status);
   chk(status, "clCreateKernel");

   // Set the kernel arguments
   status  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_output);
   status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_input);
   status |= clSetKernelArg(kernel, 2, sizeof(int), &imageWidth);
   status |= clSetKernelArg(kernel, 3, sizeof(int), &imageHeight);
   status |= clSetKernelArg(kernel, 4, sizeof(float), &sin_theta);
   status |= clSetKernelArg(kernel, 5, sizeof(float), &cos_theta);
   chk(status, "clSetKernelArg");

   // Set the work item dimensions
   size_t globalSize[2] = {imageWidth, imageHeight};
   status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
      NULL, NULL);
   chk(status, "clEnqueueNDRange");

   // Read the image back to the host
   status = clEnqueueReadBuffer(queue, d_output, CL_TRUE, 0, dataSize, 
         outputImage, 0, NULL, NULL); 
   chk(status, "clEnqueueReadBuffer");

   // Write the output image to file
   storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);

   return 0;
}
Example #7
0
void runProgram(int N, char *fileName)
{
	printf("GPU Symmetrize()..."
		"\nSquareMatrix[%d][%d]\n", N, N);

	int i,j;

	// initialize input array
	float *A;
	A = (float*)malloc(sizeof(float)*N*N);

	for( i = 0; i < N ; ++i )
	{
		for( j = 0; j < N ; ++j )
		{
			A[i*N + j] = j;	
		}
	}

	//  result
	float *Aout;
	Aout = (float*)malloc(sizeof(float)*N*N);


#ifdef DEBUG
	puts("A");
	check_2d_f(A,N,N);
#endif

	int NumK = 1;
	int NumE = 2;

	double gpuTime;
	cl_ulong gstart, gend;

	//------------------------------------------------
	//  OpenCL 
	//------------------------------------------------
	cl_int err;

	cl_platform_id platform;          // OpenCL platform
	cl_device_id device_id;           // device ID
	cl_context context;               // context
	cl_command_queue queue;           // command queue
	cl_program program;               // program

	cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK);

	cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE);    

	// read kernel file
	//char *fileName = "transpose_kernel.cl";
	char *kernelSource;
	size_t size;
	FILE *fh = fopen(fileName, "rb");
	if(!fh) {
		printf("Error: Failed to open kernel file!\n");
		exit(1);
	}
	fseek(fh,0,SEEK_END);
	size=ftell(fh);
	fseek(fh,0,SEEK_SET);
	kernelSource = malloc(size+1);
	size_t result;
	result = fread(kernelSource,1,size,fh);
	if(result != size){ fputs("Reading error", stderr);exit(1);}
	kernelSource[size] = '\0';
	
	// Bind to platform
	err = clGetPlatformIDs(1, &platform, NULL);
	OCL_CHECK(err);

	// Get ID for the device
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	OCL_CHECK(err);

	// Create a context  
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	OCL_CHECK(err);

	// Create a command queue 
	queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	OCL_CHECK(err);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err);
	OCL_CHECK(err);

	// turn on optimization for kernel
	char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only";

	err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
	if(err != CL_SUCCESS)
		printCompilerOutput(program, device_id);
	OCL_CHECK(err);



#ifdef SAVEBIN
	// Calculate size of binaries 
	size_t binary_size;
	err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL);
	OCL_CHECK(err);

	//printf("binary size = %ld\n", binary_size);

	unsigned char* bin;
	bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size);

	err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) , &bin, NULL);
	OCL_CHECK(err);

	//puts("save binaries");

	// Print the binary out to the output file
	fh = fopen("kernel.bin", "wb");
	fwrite(bin, 1, binary_size, fh);
	fclose(fh);

	puts("done save binaries");

#endif


	kernel[0] = clCreateKernel(program, "kernel_a", &err);
	OCL_CHECK(err);

	// memory on device
	cl_mem A_d    	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);
	cl_mem Aout_d   = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);


	// copy data to device
	err = clEnqueueWriteBuffer(queue, A_d, 	CL_TRUE, 0, sizeof(float)*N*N, 	A, 0, NULL , &event[0]); 
	OCL_CHECK(err);

	size_t localsize[2];
	size_t globalsize[2];

	localsize[0] = 16; 
	localsize[1] = 16;

	globalsize[0] = N;
	globalsize[1] = N;

	err  = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &Aout_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, globalsize, localsize, 0, NULL, NULL);
	OCL_CHECK(err);

	clFinish(queue);

	// read device data back to host
	clEnqueueReadBuffer(queue, Aout_d, CL_TRUE, 0, sizeof(float)*N*N, Aout, 0, NULL , &event[1]);

	err = clWaitForEvents(1,&event[1]);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL);
	OCL_CHECK(err);

	gpuTime = (double)(gend -gstart)/1000000000.0;



	//check_1d_f(sum, blks+1);

#ifdef DEBUG
	puts("Output");
	check_2d_f(Aout,N,N);
#endif

	printf("oclTime = %lf (s)\n", gpuTime );

	// free
	clReleaseMemObject(A_d);	
	clReleaseMemObject(Aout_d);	


	// // check
	// int flag = 1;
	// for(i=0;i<N;++i){
	// 	for(j=0;j<N;++j){
	// 		if(A[i*N+j] != At[j*N+i])		
	// 		{
	// 			flag  = 0;
	// 			break;
	// 		}
	// 	}
	// }
	// if( flag == 0 )
	// {
	// 	puts("Bugs! Check program.");
	// }else{
	// 	puts("Succeed!");	
	// }



	clReleaseProgram(program);
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	for(i=0;i<NumK;++i){
		clReleaseKernel(kernel[i]);
	}
	for(i=0;i<NumE;++i){
		clReleaseEvent(event[i]);
	}
	free(kernelSource);


#ifdef SAVEBIN
	free(bin);
#endif



	free(A);
	free(Aout);

	return;
}
int main(int argc, char** argv)
{
    int          rank, size;         // MPI rank & size
    int          err;                // error code returned from OpenCL calls
    float        h_a[LENGTH];        // a vector
    float        h_b[LENGTH];        // b vector
    float        h_c[LENGTH];        // c vector (a+b) returned from the compute device (local per task)
    float        _h_c[LENGTH];       // c vector (a+b) returned from the compute device (global for master)
    unsigned int correct;            // number of correct results

    size_t global;                   // global domain size
    size_t local;                    // local  domain size

    cl_device_id     device_id;      // compute device id
    cl_context       context;        // compute context
    cl_command_queue commands;       // compute command queue
    cl_program       program;        // compute program
    cl_kernel        ko_vadd;        // compute kernel

    cl_mem d_a;                      // device memory used for the input  a vector
    cl_mem d_b;                      // device memory used for the input  b vector
    cl_mem d_c;                      // device memory used for the output c vector

    int mycount, i;

    err = MPI_Init (&argc, &argv);

    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Init failed!\n");
        exit (-1);
    }

    err = MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Comm_rank failed!\n");
        exit (-1);
    }

    err = MPI_Comm_size (MPI_COMM_WORLD, &size);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Comm_size failed\n");
        exit (-1);
    }

    if (LENGTH % size != 0)
    {
        printf ("Number of MPI processes must divide LENGTH (%d)\n", LENGTH);
        exit (-1);
    }

    mycount = LENGTH / size;

    if (rank == 0)
    {
        for (i = 0; i < LENGTH; i++)
        {
            h_a[i] = rand() / (float)RAND_MAX;
            h_b[i] = rand() / (float)RAND_MAX;
            h_a[i] = i;
            h_b[i] = i*2;
        }
        err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed transferring h_a\n");
            exit (-1);
        }
        err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed transferring h_b\n");
            exit (-1);
        }
    }
    else
    {
        err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed receiving h_a\n");
            exit (-1);
        }
        err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed receiving h_b\n");
            exit (-1);
        }
    }

    // Set up platform
    cl_uint numPlatforms;

    // Find number of platforms
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to find a platform!\n");
        return EXIT_FAILURE;
    }

    // Get all platforms
    cl_platform_id Platform[numPlatforms];
    err = clGetPlatformIDs(numPlatforms, Platform, NULL);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to get the platform!\n");
        return EXIT_FAILURE;
    }

    // Secure a GPU
    for (i = 0; i < numPlatforms; i++)
    {
        err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
        if (err == CL_SUCCESS)
            break;
    }

    if (device_id == NULL)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }
    else
    {
        if (output_device_info (rank, device_id) != CL_SUCCESS)
            return EXIT_FAILURE;
    }

    // Create a compute context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }

    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    // Create the compute kernel from the program
    ko_vadd = clCreateKernel(program, "vadd", &err);
    if (!ko_vadd || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }

    // Create the input (a, b) and output (c) arrays in device memory
    d_a = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * mycount, NULL, NULL);
    d_b = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * mycount, NULL, NULL);
    d_c = clCreateBuffer(context,  CL_MEM_WRITE_ONLY, sizeof(float) * mycount, NULL, NULL);
    if (!d_a || !d_b || !d_c)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }

    // Write a and b vectors into compute device memory
    err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(float) * mycount, &h_a[rank*mycount], 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write h_a to source array!\n");
        exit(1);
    }

    err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0, sizeof(float) * mycount, &h_b[rank*mycount], 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write h_b to source array!\n");
        exit(1);
    }

    // Set the arguments to our compute kernel
    err  = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &mycount);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }

    // Get the maximum work group size for executing the kernel on the device
    err = clGetKernelWorkGroupInfo(ko_vadd, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }

    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    global = LENGTH;
    err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }

    // Wait for the commands to complete before reading back results
    clFinish(commands);

    // Read back the results from the compute device
    err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * mycount, &h_c, 0, NULL, NULL );
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }

    err = MPI_Gather (h_c, mycount, MPI_FLOAT, _h_c, mycount, MPI_FLOAT, 0, MPI_COMM_WORLD);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Gather failed receiving h_c\n");
        exit (-1);
    }

    if (rank == 0)
    {
        // Test the results
        correct = 0;
        float tmp;

        for(i = 0; i < LENGTH; i++)
        {
            tmp = h_a[i] + h_b[i];     // assign element i of a+b to tmp
            tmp -= _h_c[i];             // compute deviation of expected and output result
            if(tmp*tmp < TOL*TOL)      // correct if square deviation is less than tolerance squared
                correct++;
            else
                printf(" tmp %f h_a %f h_b %f h_c %f \n",tmp, h_a[i], h_b[i], _h_c[i]);
        }

        // summarize results
        printf("C = A+B:  %d out of %d results were correct.\n", correct, LENGTH);
    }

    // cleanup then shutdown
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(ko_vadd);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    err = MPI_Finalize ();
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Finalize failed!\n");
        exit (-1);
    }

    return 0;
}
// Main program
//*****************************************************************************
int main(int argc, char** argv) 
{
	// Locals used with command line args
    int p = 256;            // workgroup X dimension
    int q = 1;              // workgroup Y dimension

	pArgc = &argc;
	pArgv = argv;

    shrQAStart(argc, argv);

    // latch the executable path for other funcs to use
    cExecutablePath = argv[0];

    // start logs and show command line help
	shrSetLogFileName ("oclNbody.txt");
    shrLog("%s Starting...\n\n", cExecutablePath);
    shrLog("Command line switches:\n");
	shrLog("  --qatest\t\tCheck correctness of GPU execution and measure performance)\n");
	shrLog("  --noprompt\t\tQuit simulation automatically after a brief period\n");
    shrLog("  --n=<numbodies>\tSpecify # of bodies to simulate (default = %d)\n", numBodies);
	shrLog("  --double\t\tUse double precision floating point values for simulation\n");
	shrLog("  --p=<workgroup X dim>\tSpecify X dimension of workgroup (default = %d)\n", p);
	shrLog("  --q=<workgroup Y dim>\tSpecify Y dimension of workgroup (default = %d)\n\n", q);

	// Get command line arguments if there are any and set vars accordingly
    if (argc > 0)
    {
        shrGetCmdLineArgumenti(argc, (const char**)argv, "p", &p);
        shrGetCmdLineArgumenti(argc, (const char**)argv, "q", &q);
        shrGetCmdLineArgumenti(argc, (const char**)argv, "n", &numBodies);
	    bDouble = (shrTRUE == shrCheckCmdLineFlag(argc, (const char**)argv, "double"));
        bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
        bQATest = shrCheckCmdLineFlag(argc, (const char**)argv, "qatest");
    }

    //Get the NVIDIA platform
    cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clGetPlatformID...\n\n"); 
	
	if (bDouble)
	{
		shrLog("Double precision execution...\n\n");
	}
	else
	{
		shrLog("Single precision execution...\n\n");
	}

	flopsPerInteraction = bDouble ? 30 : 20; 
    
	//Get all the devices
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Set target device and Query number of compute units on uiTargetDevice
    shrLog("  # of Devices Available = %u\n", uiNumDevices); 
    if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE) 
    {
        uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
    }
    shrLog("  Using Device %u, ", uiTargetDevice); 
    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);  
    cl_uint uiNumComputeUnits;        
    clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
    shrLog("  # of Compute Units = %u\n", uiNumComputeUnits); 

    //Create the context
    shrLog("clCreateContext...\n"); 
    cxContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create a command-queue 
    shrLog("clCreateCommandQueue...\n\n"); 
    cqCommandQueue = clCreateCommandQueue(cxContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Log and config for number of bodies
    shrLog("Number of Bodies = %d\n", numBodies); 
    switch (numBodies)
    {
        case 1024:
            activeParams.m_clusterScale = 1.52f;
            activeParams.m_velocityScale = 2.f;
            break;
        case 2048:
            activeParams.m_clusterScale = 1.56f;
            activeParams.m_velocityScale = 2.64f;
            break;
        case 4096:
            activeParams.m_clusterScale = 1.68f;
            activeParams.m_velocityScale = 2.98f;
            break;
        case 7680:
        case 8192:
            activeParams.m_clusterScale = 1.98f;
            activeParams.m_velocityScale = 2.9f;
            break;
        default:
        case 15360:
        case 16384:
            activeParams.m_clusterScale = 1.54f;
            activeParams.m_velocityScale = 8.f;
            break;
        case 30720:
        case 32768:
            activeParams.m_clusterScale = 1.44f;
            activeParams.m_velocityScale = 11.f;
            break;
    }

    if ((q * p) > 256)
    {
        p = 256 / q;
        shrLog("Setting p=%d to maintain %d threads per block\n", p, 256);
    }

    if ((q == 1) && (numBodies < p))
    {
        p = numBodies;
        shrLog("Setting p=%d because # of bodies < p\n", p);
    }
    shrLog("Workgroup Dims = (%d x %d)\n\n", p, q); 

    // Initialize OpenGL items if using GL 
    if (bQATest == shrFALSE)
    {
	    shrLog("Calling InitGL...\n"); 
	    InitGL(&argc, argv);
    }
    else 
    {
	    shrLog("Skipping InitGL...\n"); 
    }
	
    // CL/GL interop disabled
    bUsePBO = (false && (bQATest == shrFALSE));
    InitNbody(cdDevices[uiTargetDevice], cxContext, cqCommandQueue, numBodies, p, q, bUsePBO, bDouble);
    ResetSim(nbody, numBodies, NBODY_CONFIG_SHELL, bUsePBO);

    // init timers
    shrDeltaT(DEMOTIME); // timer 0 is for timing demo periods
    shrDeltaT(FUNCTIME); // timer 1 is for logging function delta t's
    shrDeltaT(FPSTIME);  // timer 2 is for fps measurement   

    // Standard simulation
    if (bQATest == shrFALSE)
    {
        shrLog("Running standard oclNbody simulation...\n\n"); 
        glutDisplayFunc(DisplayGL);
        glutReshapeFunc(ReshapeGL);
        glutMouseFunc(MouseGL);
        glutMotionFunc(MotionGL);
        glutKeyboardFunc(KeyboardGL);
        glutSpecialFunc(SpecialGL);
        glutIdleFunc(IdleGL);
        glutMainLoop();
    }


    // Compare to host, profile and write out file for regression analysis
    if (bQATest == shrTRUE) {
	    bool bTestResults = false;
        shrLog("Running oclNbody Results Comparison...\n\n"); 
        bTestResults = CompareResults(numBodies);

        shrLog("Profiling oclNbody...\n\n"); 
        RunProfiling(100, (unsigned int)(p * q));  // 100 iterations

		shrQAFinish(argc, (const char **)argv, bTestResults ? QA_PASSED : QA_FAILED);
    } else {
        // Cleanup/exit 
	    bNoPrompt = shrTRUE;
        shrQAFinish2(false, *pArgc, (const char **)pArgv, QA_PASSED);
    }
    Cleanup(EXIT_SUCCESS);
}
Example #10
0
int main()
{
	cl_device_id device = new_device();
	cl_context context;
	cl_command_queue queue;
	cl_program program;
	cl_kernel kernel;
	cl_int i, j, err;

	float matrix_1[80], matrix_2[80], matrix_3[80];
	const size_t buffer_origin[3] = { 5 * sizeof(float), 3, 0 };
	const size_t host_origin[3] = { 1 * sizeof(float), 1, 0 };
	const size_t region[3] = { 4 * sizeof(float), 4, 1 };
	cl_mem matrix_buffer_1, matrix_buffer_2, matrix_buffer_3;

	for (i = 0; i < 80; i++)
	{
		matrix_1[i] = i * 1.0f;
		matrix_2[i] = 3.0;
		matrix_3[i] = 0;
	}

	context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
	if (err < 0)
	{
		perror("Couldn't create a context\n");
		exit(1);
	}
	program = build_program(context, device, FILE_NAME);
	kernel = clCreateKernel(program, "add", &err);
	if (err < 0) {
		perror("Couldn't create a kernel\n");
		exit(1);
	}
	
	matrix_buffer_1 = clCreateBuffer(context, CL_MEM_READ_WRITE |
		CL_MEM_COPY_HOST_PTR, sizeof(matrix_1), matrix_1, &err);
	if (err < 0) {
		perror("Couldn't create a buffer\n");
		exit(1);
	}
	matrix_buffer_2 = clCreateBuffer(context, CL_MEM_READ_WRITE |
		CL_MEM_COPY_HOST_PTR, sizeof(matrix_2), matrix_2, &err);
	if (err < 0) {
		perror("Couldn't create a buffer\n");
		exit(1);
	}
	matrix_buffer_3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(matrix_3), matrix_3, &err);
	if (err < 0) {
		perror("Couldn't create a buffer\n");
		exit(1);
	}
	int row = 8;
	int col = 10;
	err = clSetKernelArg(kernel, 0, sizeof(int), &row);
	err = clSetKernelArg(kernel, 1, sizeof(int), &col);
	err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &matrix_buffer_1);
	err = clSetKernelArg(kernel, 3, sizeof(cl_mem), &matrix_buffer_2);
	err = clSetKernelArg(kernel, 4, sizeof(cl_mem), &matrix_buffer_3);

	queue = clCreateCommandQueue(context, device, 0, &err);
	if (err < 0) {
		perror("Couldn't create a command queue\n");
		exit(1);
	}
	err = clEnqueueTask(queue, kernel, 0, NULL, NULL);
	if (err < 0) {
		perror("Couldn't enque task\n");
		exit(1);
	}
	err = clEnqueueReadBuffer(queue, matrix_buffer_3, CL_TRUE, 0,
		sizeof(matrix_3), &matrix_3, 0, NULL, NULL);
	for (i = 0; i < 8; i++) {
		for (j = 0; j < 10; j++) {
			printf("%6.1f ", matrix_3[j + i * 10]);
		}
		printf("\n");
	}
	clReleaseMemObject(matrix_buffer_1);
	clReleaseMemObject(matrix_buffer_2);
	clReleaseMemObject(matrix_buffer_3);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(queue);
	clReleaseContext(context);
	return 0;
}
void call_kernel(float *data,unsigned int count,char * cl_name,float *results) {

    FILE* programHandle;
    size_t programSize, KernelSourceSize;
    char *programBuffer, *KernelSource;

    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation

    cl_device_id device_id;             // compute device id
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel

    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array

	int err;
    int gpu = 1;
    err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    commands = clCreateCommandQueue(context, device_id, 0, &err);

	//----------------------------------------------------------------------------
    // get size of kernel source
    programHandle = fopen(cl_name, "r");
    fseek(programHandle, 0, SEEK_END);
    programSize = ftell(programHandle);
    rewind(programHandle);

    programBuffer = (char*) malloc(programSize + 1);
    programBuffer[programSize] = '\0';
    fread(programBuffer, sizeof(char), programSize, programHandle);
    fclose(programHandle);

    // create program from buffer
    program = clCreateProgramWithSource(context,1,(const char**) &programBuffer,&programSize, NULL);
    free(programBuffer);

    // read kernel source back in from program to check
    clGetProgramInfo(program, CL_PROGRAM_SOURCE, 0, NULL, &KernelSourceSize);
    KernelSource = (char*) malloc(KernelSourceSize);
    clGetProgramInfo(program, CL_PROGRAM_SOURCE, KernelSourceSize, KernelSource, NULL);

    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    kernel = clCreateKernel(program, "square", &err);
	//----------------------------------------------------------------------------

    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);

    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);

    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    global = count;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    clFinish(commands);
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );

    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    printf("nKernel source:\n\n %s \n", KernelSource);
    free(KernelSource);
}
Example #12
0
int main() {

   /* Host/device data structures */
   cl_platform_id platform;
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_int err;

   /* Program/kernel data structures */
   cl_program program;
   FILE *program_handle;
   char *program_buffer, *program_log;
   size_t program_size, log_size;
   cl_kernel kernel;
   size_t offset = 0;
   size_t global_size, local_size;

   /* Data and buffers */
   char pattern[16] = "thatwithhavefrom";
   FILE *text_handle;
   char *text;
   size_t text_size;
   int chars_per_item;
   int result[4] = {0, 0, 0, 0};
   cl_mem text_buffer, result_buffer;
   
   /* Identify a platform */
   err = clGetPlatformIDs(1, &platform, NULL);
   if(err < 0) {
      perror("Couldn't identify a platform");
      exit(1);
   } 

   /* Access a device */
   err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
   if(err < 0) {
      perror("Couldn't access any devices");
      exit(1);   
   }

   /* Determine global size and local size */
   clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, 		
      sizeof(global_size), &global_size, NULL);	
   clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, 		
      sizeof(local_size), &local_size, NULL);
   global_size *= local_size;

   /* Create a context */
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }

   /* Read program file and place content into buffer */
   program_handle = fopen(PROGRAM_FILE, "r");
   if(program_handle == NULL) {
      perror("Couldn't find the program file");
      exit(1);
   }
   fseek(program_handle, 0, SEEK_END);
   program_size = ftell(program_handle);
   rewind(program_handle);
   program_buffer = (char*)calloc(program_size+1, sizeof(char));
   fread(program_buffer, sizeof(char), program_size, program_handle);
   fclose(program_handle);

   /* Read text file and place content into buffer */
   text_handle = fopen(TEXT_FILE, "r");
   if(text_handle == NULL) {
      perror("Couldn't find the text file");
      exit(1);
   }
   fseek(text_handle, 0, SEEK_END);
   text_size = ftell(text_handle)-1;
   rewind(text_handle);
   text = (char*)calloc(text_size, sizeof(char));
   fread(text, sizeof(char), text_size, text_handle);
   fclose(text_handle);
   chars_per_item = text_size / global_size + 1;

   /* Create program from file */
   program = clCreateProgramWithSource(context, 1, 
      (const char**)&program_buffer, &program_size, &err);
   if(err < 0) {
      perror("Couldn't create the program");
      exit(1);
   }
   free(program_buffer);

   /* Build program */
   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
   if(err < 0) {
            
      /* Find size of log and print to std output */
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
            0, NULL, &log_size);
      program_log = (char*) calloc(log_size+1, sizeof(char));
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
            log_size+1, program_log, NULL);
      printf("%s\n", program_log);
      free(program_log);
      exit(1);
   }

   /* Create a kernel */
   kernel = clCreateKernel(program, KERNEL_FUNC, &err);
   if(err < 0) {
      perror("Couldn't create a kernel");
      exit(1);
   };

   /* Create buffers to hold the text characters and count */
   text_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
         CL_MEM_COPY_HOST_PTR, text_size, text, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);   
   };
   result_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
         CL_MEM_COPY_HOST_PTR, sizeof(result), result, NULL);

   /* Create kernel argument */
   err = clSetKernelArg(kernel, 0, sizeof(pattern), pattern);
   err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &text_buffer);
   err |= clSetKernelArg(kernel, 2, sizeof(chars_per_item), &chars_per_item);
   err |= clSetKernelArg(kernel, 3, 4 * sizeof(int), NULL);
   err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &result_buffer);
   if(err < 0) {
      printf("Couldn't set a kernel argument");
      exit(1);   
   };

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 0, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Enqueue kernel */
   err = clEnqueueNDRangeKernel(queue, kernel, 1, &offset, &global_size, 
         &local_size, 0, NULL, NULL); 
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      printf("Error code: %d\n", err);
      exit(1);   
   }

   /* Read and print the result */
   err = clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0, 
      sizeof(result), &result, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't read the buffer");
      exit(1);   
   }

   printf("\nResults: \n");
   printf("Number of occurrences of 'that': %d\n", result[0]);
   printf("Number of occurrences of 'with': %d\n", result[1]);
   printf("Number of occurrences of 'have': %d\n", result[2]);
   printf("Number of occurrences of 'from': %d\n", result[3]);

   /* Deallocate resources */
   clReleaseMemObject(result_buffer);
   clReleaseMemObject(text_buffer);
   clReleaseKernel(kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}
int main()
{
        //Control Variables
        bool showStartInput=false;// Setting it to true shows the original Input
        bool showFftOutput=false;// Shows the output after the FFT but before the Reshuffle
        bool showReshuffleOutput=false;// Shows the output after the reshuffle
        bool showFinalResult=false; // Shows final result after cross-correlation
        bool showGemmInput=false; // Shows output after the reshuffle but before the matrix multiplication
	bool showReformatOutputAfterReshuffle=false; // Shows output after it has been reformatted after the reshuffling

	//openCL State
        cl_platform_id platform_id=NULL;
	cl_device_id device_id=NULL;
        cl_context context=NULL;
        cl_command_queue queue=NULL;
        cl_program program=NULL;
        cl_kernel kernel=NULL;
        cl_uint ret_num_devices;
        cl_uint ret_num_platforms;
        cl_int ret=0; // Stores the error values retuned by many functions
	cl_event event = NULL;
        cl_event events[10];
	cl_kernel clKernel;

	//FFT state
	clAmdFftPlanHandle plHandle;
        clAmdFftResultLocation place = CLFFT_OUTOFPLACE; //Alternative CLFFT_INPLACE
        clAmdFftLayout inLayout  = CLFFT_COMPLEX_INTERLEAVED;
        clAmdFftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED;
        clAmdFftDim dim = CLFFT_1D;
	size_t clStrides[3]={0,0,0};
        size_t clLengths[3];
	clLengths[0]=(MEM_SIZE/2);//Length of first dimension of fft
	clLengths[1]=1;//length of second dimension of fft
	clLengths[2]=1;
	clStrides[ 0 ] = 1;
        clStrides[ 1 ] = clStrides[ 0 ] * clLengths[ 0 ];
        clStrides[ 2 ] = clStrides[ 1 ] * clLengths[ 1 ];
        clStrides[ 3 ] = clStrides[ 2 ] * clLengths[ 2 ];
	size_t batchSize=CHANSIZE;//number of discreet fft's to be calculated simultaneously


	//Initialise openCL
	OPENCL_V_THROW(clGetPlatformIDs(1, &platform_id, &ret_num_platforms),"clGetPlatformIDs Failed");
        OPENCL_V_THROW(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,&ret_num_devices),"clGetDeviceIDs Failed");
        context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
        OPENCL_V_THROW(ret, "Creating Context failed" );
        queue = clCreateCommandQueue(context, device_id, 0, &ret);
        OPENCL_V_THROW(ret, "Creating command queue failed" );


//===========Initialise the host buffers======================================

/*
 * The functions sgenerate2darray(), screate2darray() and sgenerate2darrayout() are defined and declared in definition.h
 */

	float** src_a_h=sgenerate2darray(NO_INPUTS,MEM_SIZE);//To be used to store the original input

	float** answer=screate2darray(NO_INPUTS,MEM_SIZE);//To be used to store the answer after the reshuffling

        float** corr_h=sgenerate2darrayout(NO_INPUTS,CHANSIZE << 1,CHANNELNO);// To be used to store the final answer

        if(showStartInput){
            cout << "Initial Input Buffer" << "\n";
            for(int j=0;j<NO_INPUTS;j++){
                for(int i=0;i<MEM_SIZE;i++){
                        cout << src_a_h[j][i] << " ";
                }cout << "\n";
            }printf("\n");
        }


//===================================================================
        //Calculation of facs for reshuffling
	complex <float>* facs_h=(complex <float>*) malloc(sizeof(complex <float>)*(MEM_SIZE/2));
	complex<float> I=1.0i;
        complex <float> xx=2.0*PI;
        for(int i=0;i<MEM_SIZE/2;i++){
                facs_h[i]=(1.0*i)/(1.0*MEM_SIZE);
                facs_h[i]=exp(xx*(-I*facs_h[i]));
        }
//===================================================================
	//Initialise GPU memory buffers
	size_t sizeofgpumem=NO_INPUTS*MEM_SIZE*sizeof(float);
	size_t sizeoffacsmem=MEM_SIZE*sizeof(float);
	cl_mem clMemBuffersIn = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,sizeofgpumem,src_a_h[0],&ret);
	OPENCL_V_THROW( ret, "Creating clMemBuffersIn Buffer failed" );

	cl_mem clMemBuffersOut = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeofgpumem,NULL,&ret);
	OPENCL_V_THROW (ret, "Creating fft output Buffer failed");

	cl_mem facs = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,sizeoffacsmem,facs_h,&ret);
	OPENCL_V_THROW (ret, "Creating facs Buffer failed");

	//===========================Starting the fft=============================//
	clAmdFftSetupData setupData;
	OPENCL_V_THROW( clAmdFftInitSetupData( &setupData ),"clAmdFftInitSetupData failed" );
        OPENCL_V_THROW( clAmdFftSetup( &setupData ), "clAmdFftSetup failed" );
	OPENCL_V_THROW( clAmdFftCreateDefaultPlan( &plHandle, context, dim, clLengths ), "clAmdFftCreateDefaultPlan failed" );
	OPENCL_V_THROW (clAmdFftSetPlanBatchSize (plHandle, batchSize),"Setting BatchSize Failed");
        OPENCL_V_THROW (clAmdFftSetResultLocation( plHandle, place ), "clAmdFftSetResultLocation failed" );
	OPENCL_V_THROW (clAmdFftSetPlanInStride  ( plHandle, dim, clStrides ), "clAmdFftSetPlanInStride failed" );
        OPENCL_V_THROW (clAmdFftSetPlanOutStride ( plHandle, dim, clStrides ), "clAmdFftSetPlanOutStride failed" );
        OPENCL_V_THROW (clAmdFftSetPlanDistance  ( plHandle, clStrides[ dim ], clStrides[ dim ]), "clAmdFftSetPlanDistance failed" );

	OPENCL_V_THROW( clAmdFftBakePlan( plHandle, 1, &queue, NULL, NULL ), "clAmdFftBakePlan failed" );

	size_t tempbuffersize=0;
        OPENCL_V_THROW( clAmdFftGetTmpBufSize(plHandle, &tempbuffersize ), "clAmdFftGetTmpBufSize failed" );

	//allocate the intermediate buffer
        cl_mem clMedBuffer=NULL;
        if (tempbuffersize)
        {
                cl_int medstatus;
                clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,tempbuffersize, 0, &medstatus);
                OPENCL_V_THROW( medstatus, "Creating fft intermediate Buffer failed" );
        }
	if (( place == CLFFT_INPLACE )&& ( inLayout != outLayout )) {
                switch( inLayout )
                {
                case CLFFT_COMPLEX_INTERLEAVED:
                        {
                                assert (CLFFT_COMPLEX_PLANAR == outLayout);
                                throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
                                break;
                        }
                case CLFFT_COMPLEX_PLANAR:
                        {
                                assert (CLFFT_COMPLEX_INTERLEAVED == outLayout);
                                throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
                                break;
                        }
                }
        }
	cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL :  &clMemBuffersOut;

	//========Timimg fft============//

	double time_fft_start=omp_get_wtime();

	for(int i=0;i<ITER_FFT;i++){
		OPENCL_V_THROW( clAmdFftEnqueueTransform( plHandle, CLFFT_FORWARD, 1,&queue,0,NULL,&event,&clMemBuffersIn,BuffersOut,clMedBuffer ),"clAmdFftEnqueueTransform failed" );
	}
	ret=clWaitForEvents(1,&event);

        double time_fft_end=omp_get_wtime();

	//Cleaning up fft
        OPENCL_V_THROW( clAmdFftDestroyPlan( &plHandle ), "clAmdFftDestroyPlan failed" );
        OPENCL_V_THROW( clAmdFftTeardown( ), "clAmdFftTeardown failed" );

        //displaying results
        if(showFftOutput){
            OPENCL_V_THROW( clEnqueueReadBuffer( queue, clMemBuffersOut, CL_TRUE, 0, sizeofgpumem,answer [0], 0, NULL, NULL ),"Reading the result buffer failed" );

            cout << "**FFT Output**" << endl;
            for(int j=0;j<NO_INPUTS;j++){
                    for(int i=0;i<MEM_SIZE;i++){
                            cout << answer[j][i] << " ";
                    } printf("\n");
            }printf("\n");
        }
	//==================End of FFT=============================================//

	//==================Start the Reshuffling==================================//
	FILE *fp;
	char fileName[]="./reshuffle.cl";
	char* source_str=NULL;
	size_t source_size;

	//Load the source code containing the kernel/
 	fp = fopen(fileName, "r");
 	if (!fp) {
 		fprintf(stderr, "Failed to load reshuffle kernel.Â¥n");
	 	exit(1);
 	}
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);

	//Preparation for building the Kernel
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret);
        OPENCL_V_THROW( ret, "Creating program with source failed for Reshuffle" );
	OPENCL_V_THROW( clBuildProgram(program, 1, &device_id, NULL, NULL, NULL),"Build Program Failed for Reshuffle");
	kernel = clCreateKernel(program, "reshuffle", &ret);
	OPENCL_V_THROW( ret, "Creating kernel failed for Reshuffle" );

	//Set kernel parameters
	const int num=NO_INPUTS*MEM_SIZE;
	const int block=MEM_SIZE;
	OPENCL_V_THROW(clSetKernelArg(kernel, 0, sizeof(cl_mem), (float *)&clMemBuffersIn),"Passing argument 0 of reshuffle failed");
 	OPENCL_V_THROW(clSetKernelArg(kernel, 1, sizeof(cl_mem), (float *)&facs),"Passing arg 1 of reshuffle failed");
        OPENCL_V_THROW(clSetKernelArg(kernel, 2, sizeof(cl_mem), (float *)&clMemBuffersOut),"Passing arg2 of reshuffle failed");
 	OPENCL_V_THROW(clSetKernelArg(kernel, 3, sizeof(int), (int *)&num),"Passing arg3 of reshuffle failed");
        OPENCL_V_THROW(clSetKernelArg(kernel, 4, sizeof(int), (int *)&block),"Passing arg4 of reshuffle failed");


	// Execute OpenCL Kernel //
	const size_t local_ws=NO_INPUTS*MEM_SIZE;
	const size_t global_ws=min(NO_THREAD_PER_BLOCK,MEM_SIZE);//ceil(MEM_SIZE/local_ws);

	//===========timing the reshuffle===============//
	double time_reshuffle_start=omp_get_wtime();
	for(int i=0;i<ITER_FFT;i++){
		OPENCL_V_THROW(clEnqueueNDRangeKernel(queue,kernel, 1, NULL,&local_ws,&global_ws, 0, NULL, NULL),"Reshuffle Kernel execution failed");
	}
	double time_reshuffle_end=omp_get_wtime();

	//Read back data
	OPENCL_V_THROW(clEnqueueReadBuffer(queue, clMemBuffersOut, CL_TRUE, 0, sizeofgpumem,answer[0], 0, NULL, NULL),"Reading back reshuffled data failed");

	//====================Finish the reshuffling================================//

        if(showReshuffleOutput){
                cout << "Output after reshuffling" << endl;
                for(int j=0;j<NO_INPUTS;j++){
                        for(int i=0;i<MEM_SIZE;i++){
                                cout << answer[j][i] << " ";
                        } printf("\n");
                }printf("\n");
        }
        //=================Reformatting the input given to the matrix multiply===================================//
        float** answer_final=screate2darray(NO_INPUTS*2,MEM_SIZE/2);
        for(int i=0;i<NO_INPUTS;i++){
            for(int j=0;j<MEM_SIZE;j++){
                if(j&1)
                        answer_final[(i<<1)+1][j >> 1]=answer[i][j];
                else
                        answer_final[(i<<1)][j >> 1]=answer[i][j];
            }
        }
Example #14
0
MainContext::MainContext(const cl_device_type_t dev_type)
{
    if (clGetPlatformIDs(0, NULL, &plat_count) == CL_SUCCESS)
    {
        plat = new cl_platform_id_t[plat_count];

        if (clGetPlatformIDs(plat_count, plat, NULL) == CL_SUCCESS)
        {
            dev_count = new cl_uint_t[plat_count];
            dev = new cl_device_id_t*[plat_count];
            context = new cl_context_t*[plat_count];
            cmd = new cl_command_queue_t*[plat_count];

            for (cl_uint_t p = 0; p < plat_count; p++)
            {
                if (clGetDeviceIDs(plat[p],
                                   dev_type,
                                   0,
                                   NULL,
                                   &dev_count[p]) == CL_SUCCESS)
                {
                    total_dev_count += dev_count[p];
                    dev[p] = new cl_device_id_t[dev_count[p]];
                    context[p] = new cl_context_t[dev_count[p]];
                    cmd[p] = new cl_command_queue_t[dev_count[p]];

                    if (clGetDeviceIDs(plat[p],
                                       dev_type,
                                       dev_count[p],
                                       dev[p],
                                       NULL) == CL_SUCCESS)
                    {
                        cl_context_properties_t properties[3] = {
                            CL_CONTEXT_PLATFORM,
                            (cl_context_properties_t) plat[p],
                            0
                        };

                        for (cl_uint_t d = 0; d < dev_count[p]; d++)
                        {
                            cl_int_t err;
                            context[p][d] = clCreateContext(properties,
                                                            1,
                                                            &dev[p][d],
                                                            NULL,
                                                            NULL,
                                                            &err);
                            if (err == CL_SUCCESS)
                            {
                                cmd[p][d] = clCreateCommandQueue(context[p][d],
                                                                 dev[p][d],
                                                                 CL_QUEUE_PROFILING_ENABLE,
                                                                 &err);
                                if (err != CL_SUCCESS)
                                {
                                    throw 1;
                                }
                            }
                            else
                            {
                                throw 2;
                            }
                        }
                    }
                    else
                    {
                        throw 3;
                    }
                }
                else
                {
                    throw 4;
                }
            }
        }
        else
        {
            throw 5;
        }
    }
    else
    {
        throw 6;
    }
}
Example #15
0
void create_context_on(const char *plat_name, const char*dev_name, cl_uint idx,
    cl_context *ctx, cl_command_queue *queue, int enable_profiling)
{
  cl_uint plat_count;

  CALL_CL_GUARDED(clGetPlatformIDs, (0, NULL, &plat_count));

  cl_platform_id *platforms = 
    (cl_platform_id *) malloc(plat_count*sizeof(cl_platform_id));
  CHECK_SYS_ERROR(!platforms, "allocating platform array");

  CALL_CL_GUARDED(clGetPlatformIDs, (plat_count, platforms, NULL));
  for (cl_uint i = 0; i < plat_count; ++i)
  {
    char buf[100];
    CALL_CL_GUARDED(clGetPlatformInfo, (platforms[i], CL_PLATFORM_VENDOR, 
          sizeof(buf), buf, NULL));

    if (!plat_name || strstr(buf, plat_name))
    {
      cl_uint dev_count;
      CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL,
            0, NULL, &dev_count));

      cl_device_id *devices = 
        (cl_device_id *) malloc(dev_count*sizeof(cl_device_id));
      CHECK_SYS_ERROR(!devices, "allocating device array");

      CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL,
            dev_count, devices, NULL));

      for (cl_uint j = 0; j < dev_count; ++j)
      {
        char buf[100];
        CALL_CL_GUARDED(clGetDeviceInfo, (devices[j], CL_DEVICE_NAME,
              sizeof(buf), buf, NULL));
        if (!dev_name || strstr(buf, dev_name))
        {
          if (idx == 0)
          {
            cl_platform_id plat = platforms[i];
            cl_device_id dev = devices[j];

            free(devices);
            free(platforms);

            cl_context_properties cps[3] = { 
              CL_CONTEXT_PLATFORM, (cl_context_properties) plat, 0 };

            cl_int status;
            *ctx = clCreateContext(
                cps, 1, &dev, NULL, NULL, &status);
            CHECK_CL_ERROR(status, "clCreateContext");


            cl_command_queue_properties qprops = 0;
            if (enable_profiling)
              qprops |= CL_QUEUE_PROFILING_ENABLE;

            *queue = clCreateCommandQueue(*ctx, dev, qprops, &status);
            CHECK_CL_ERROR(status, "clCreateCommandQueue");

            return;
          }
          else
            --idx;
        }
      }

      free(devices);
    }
  }

  free(platforms);

  fputs("create_context_on: specified device not found.\n", stderr);
  abort();
}
Example #16
0
static sc_status
SetupOpenCLSessionForDeviceType(
	sc_session handle,
	sc_device_type device_type, 
	sc_uint device_count)
{
    sc_status status = 0;
    cl_int system_status = 0;
    cl_device_type system_device_type = 0;
	cl_uint system_device_count = 0;
	cl_uint system_platform_count = 0;
	cl_device_id *system_device_list = 0;
	cl_platform_id system_platform = 0;
    size_t return_size = 0;
    
    sc_session_t *session = (sc_session_t*)handle;
	
	system_status = clGetPlatformIDs(1, &system_platform, &system_platform_count); 
	if (system_status != CL_SUCCESS || system_platform_count < 1)
	{
		scError(NULL, SC_INVALID_PLATFORM, "Failed to locate platform!\n");
		return SC_INVALID_PLATFORM;
	}

	system_device_type = GetOpenCLDeviceType(device_type, &status);
	if (status != CL_SUCCESS)
	{
		scError(NULL, SC_INVALID_DEVICE_TYPE, "Invalid compute device type specified!\n");
		return SC_INVALID_DEVICE_TYPE;
	}
	
	system_status = clGetDeviceIDs(system_platform, system_device_type, 0, NULL, &system_device_count);
	if (system_status != CL_SUCCESS || system_device_count < 1)
	{
		scError(NULL, SC_DEVICE_NOT_AVAILABLE, "Failed to locate compute device!\n");
		return SC_DEVICE_NOT_AVAILABLE;
	}
	
	if(device_count)
		system_device_count = system_device_count > device_count ? device_count : system_device_count;
	
	system_device_list = scAllocate(NULL, sizeof(cl_device_id) * device_count);
	memset(system_device_list, 0, sizeof(cl_device_id) * device_count);
	
	system_status = clGetDeviceIDs(system_platform, system_device_type, system_device_count, system_device_list, &system_device_count);
	if (system_status != CL_SUCCESS)
	{
		scError(NULL, SC_DEVICE_NOT_AVAILABLE, "Failed to locate compute device!\n");
		return SC_DEVICE_NOT_AVAILABLE;
	}
	
	session->context = clCreateContext(NULL, system_device_count, system_device_list, NotifyCallback, session, &system_status);
    if (!session->context)
    {
        scError(handle, SC_INVALID_CONTEXT, "Failed to create compute context!\n");
        return SC_INVALID_CONTEXT;
    }

	session->platform = system_platform;
    system_status = clGetContextInfo(session->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id) * system_device_count, system_device_list, &return_size);
    if(system_status != CL_SUCCESS || return_size < 1)
    {
        scError(handle, SC_INVALID_DEVICE, "Failed to retrieve compute devices for context!\n");
        return SC_INVALID_DEVICE;
    }
    
    session->devices = system_device_list;
    session->units = system_device_count;
    session->queues = scAllocate(handle, sizeof(cl_command_queue) * session->units);
    if(!session->queues)
    {
        scError(handle, SC_OUT_OF_HOST_MEMORY, "Failed to allocate command queues!\n");
        return SC_OUT_OF_HOST_MEMORY;
    }

    for(sc_uint i = 0; i < session->units; i++)
    {
        cl_char vendor_name[256] = {0};
        cl_char device_name[256] = {0};

        system_status = clGetDeviceInfo(session->devices[i], CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &return_size);
        system_status|= clGetDeviceInfo(session->devices[i], CL_DEVICE_NAME, sizeof(device_name), device_name, &return_size);
        if (system_status != CL_SUCCESS)
        {
            scError(NULL, SC_INVALID_DEVICE_INFO, "Failed to retrieve device info!\n");
            return SC_INVALID_DEVICE_INFO;
        }

        scInfo(handle, "Adding device '%s' '%s' to compute session.\n", vendor_name, device_name);

        session->queues[i] = clCreateCommandQueue(session->context, session->devices[i], 0, &system_status);
        if (!session->queues[i])
        {
            scError(handle, SC_INVALID_COMMAND_QUEUE, "Failed to create a command queue!\n");
            return SC_INVALID_COMMAND_QUEUE;
        }
    }

	session->programs = scCreateMap(handle, SC_DEFAULT_MAP_SIZE);
	session->kernels = scCreateMap(handle, SC_DEFAULT_MAP_SIZE);
	session->mem = scCreateMap(handle, SC_DEFAULT_MAP_SIZE);
	session->valid = SC_TRUE;

    return SC_SUCCESS;
}
Example #17
0
int main(int argc, char** argv)
{
    int err;                            // error code returned from api calls
    
    float data[DATA_SIZE];              // original data set given to device
    float results[DATA_SIZE];           // results returned from device
    unsigned int correct;               // number of correct results returned
    
    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation
    
    cl_device_id device_id;             // compute device id
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel
    
    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array
    
    // Fill our data set with random float values
    //
    int i = 0;
    unsigned int count = DATA_SIZE;
    for(i = 0; i < count; i++)
        data[i] = rand() / (float)RAND_MAX;
    
    // Connect to a compute device
    //
    int gpu = 1;
    err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }
    
    // Create a compute context
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }
    
    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }
    
    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }
    
    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
        
        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }
    
    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "square", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }
    
    // Create the input and output arrays in device memory for our calculation
    //
    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }
    
    // Write our data set into the input array in device memory
    //
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }
    
    // Set the arguments to our compute kernel
    //
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }
    
    // Get the maximum work group size for executing the kernel on the device
    //
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }
    
    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    //
    global = count;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }
    
    // Wait for the command commands to get serviced before reading back results
    //
    clFinish(commands);
    
    // Read back the results from the device to verify the output
    //
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }
    
    // Validate our results
    //
    correct = 0;
    for(i = 0; i < count; i++)
    {
        if(results[i] == data[i] * data[i])
            correct++;
    }
    
    // Print a brief summary detailing the results
    //
    printf("Computed '%d/%d' correct values!\n", correct, count);
    
    // Shutdown and cleanup
    //
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    
    return 0;
}
Example #18
0
int main(int argc, char** argv) {

   /* OpenCL 1.1 data structures */
   cl_platform_id* platforms;
   cl_program program;
   cl_context context;

   /* OpenCL 1.1 scalar data types */
   cl_uint numOfPlatforms;
   cl_int  error;

   /*
    Prepare an array of __cl_float4 via dynamic memory allocation
    This will map to the native vector type which is SSE / SSE2 / AVX on
    Intel-compatible processors.
   */
   cl_float8* ud_in = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // input to device
   cl_float8* ud_out = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // output from device
   for( int i = 0; i < DATA_SIZE; ++i) {
       ud_in[i] = (cl_float8){(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i};
       ud_out[i] = (cl_float8){(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f};
   }

   /* 
      Get the number of platforms 
      Remember that for each vendor's SDK installed on the computer,
      the number of available platform also increased. 
    */
   error = clGetPlatformIDs(0, NULL, &numOfPlatforms);
   if(error != CL_SUCCESS ) {			
      perror("Unable to find any OpenCL platforms");
      exit(1);
   }

   platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms);
   printf("Number of OpenCL platforms found: %d\n", numOfPlatforms);

   error = clGetPlatformIDs(numOfPlatforms, platforms, NULL);
   if(error != CL_SUCCESS ) {			
      perror("Unable to find any OpenCL platforms");
      exit(1);
   }
   // Search for a CPU/GPU device through the installed platforms
   // Build a OpenCL program and do not run it.
   for(cl_uint i = 0; i < numOfPlatforms; i++ ) {

        cl_uint numOfDevices = 0;

        /* Determine how many devices are connected to your platform */
        error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &numOfDevices);
        if (error != CL_SUCCESS ) { 
            perror("Unable to obtain any OpenCL compliant device info");
            exit(1);
        }
        cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices);

        /* Load the information about your devices into the variable 'devices' */
        error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL);
        if (error != CL_SUCCESS ) { 
            perror("Unable to obtain any OpenCL compliant device info");
            exit(1);
        }
        printf("Number of detected OpenCL devices: %d\n", numOfDevices);

	    /* Create a context */
        cl_context_properties ctx[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i], 0 };
	    context = clCreateContext(ctx, numOfDevices, devices, NULL, NULL, &error);
	    if(error != CL_SUCCESS) {
	        perror("Can't create a valid OpenCL context");
	        exit(1);
	    }

	    /* For each device, create a buffer and partition that data among the devices for compute! */
	    cl_mem inobj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
	                                  sizeof(cl_float8) * DATA_SIZE, ud_in, &error);
	    if(error != CL_SUCCESS) {
	        perror("Can't create a buffer");
	        exit(1);
	    }

        int offset = 0; 
        for(int i = 0; i < numOfDevices; ++i, ++offset ) {
	        /* Load the two source files into temporary datastores */
	        const char *file_names[] = {"vectorization.cl"}; 
	        const int NUMBER_OF_FILES = 1;
	        char* buffer[NUMBER_OF_FILES];
	        size_t sizes[NUMBER_OF_FILES];
	        loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes);
	
	        /* Create the OpenCL program object */
	        program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error);				
		    if(error != CL_SUCCESS) {
		      perror("Can't create the OpenCL program object");
		      exit(1);   
		    }

	        /* Build OpenCL program object and dump the error message, if any */
	        char *program_log;
	        size_t log_size;
            char* build_options = "-fbin-llvmir -fbin-amdil -fbin-exe";
	        error = clBuildProgram(program, 1, &devices[i], build_options, NULL, NULL);		
		    if(error != CL_SUCCESS) {
		      // If there's an error whilst building the program, dump the log
		      clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
		      program_log = (char*) malloc(log_size+1);
		      program_log[log_size] = '\0';
		      clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 
		            log_size+1, program_log, NULL);
		      printf("\n=== ERROR ===\n\n%s\n=============\n", program_log);
		      free(program_log);
		      exit(1);
		    }
	  
	        /* Query the program as to how many kernels were detected */
	        cl_uint numOfKernels;
	        error = clCreateKernelsInProgram(program, 0, NULL, &numOfKernels);
	        if (error != CL_SUCCESS) {
	            perror("Unable to retrieve kernel count from program");
	            exit(1);
	        }
	        cl_kernel* kernels = (cl_kernel*) alloca(sizeof(cl_kernel) * numOfKernels);
	        error = clCreateKernelsInProgram(program, numOfKernels, kernels, NULL);

            /* Loop thru each kernel and execute on device */
	        for(cl_uint j = 0; j < numOfKernels; j++) {
	            char kernelName[32];
	            cl_uint argCnt;
	            clGetKernelInfo(kernels[j], CL_KERNEL_FUNCTION_NAME, sizeof(kernelName), kernelName, NULL);
	            clGetKernelInfo(kernels[j], CL_KERNEL_NUM_ARGS, sizeof(argCnt), &argCnt, NULL);
	            printf("Kernel name: %s with arity: %d\n", kernelName, argCnt);
	            printf("About to create command queue and enqueue this kernel...\n");
	
	            /* Create a command queue */
	            cl_command_queue cQ = clCreateCommandQueue(context, devices[i], 0, &error);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to create command-queue");
	                exit(1);
	            }
	
                /* Create a buffer and copy the data from the main buffer */
	            cl_mem outobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 
	                                                sizeof(cl_float8) * DATA_SIZE, 0, &error);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to create sub-buffer object");
	                exit(1);
	            }

	            /* Let OpenCL know that the kernel is suppose to receive an argument */
	            error = clSetKernelArg(kernels[j], 0, sizeof(cl_mem), &inobj);
	            error = clSetKernelArg(kernels[j], 1, sizeof(cl_mem), &outobj);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to set buffer object in kernel");
	                exit(1);
	            }
	
	            /* Enqueue the kernel to the command queue */
	            error = clEnqueueTask(cQ, kernels[j], 0, NULL, NULL);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to enqueue task to command-queue");
	                exit(1);
	            }
	            printf("Task has been enqueued successfully!\n");

	            /* Enqueue the read-back from device to host */
	            error = clEnqueueReadBuffer(cQ, outobj,
	                                         CL_TRUE,               // blocking read
	                                         0,                      // read from the start
	                                         sizeof(cl_float8)*DATA_SIZE,          // how much to copy
	                                         ud_out, 0, NULL, NULL);
                /* Check the returned data */
	            if ( valuesOK(ud_in, ud_out, DATA_SIZE) ) {
	                printf("Check passed!\n");
	            } else printf("Check failed!\n");
	
	            /* Release the command queue */
	            clReleaseCommandQueue(cQ);
	            clReleaseMemObject(outobj);
	        } 

        /* Clean up */
        
        for(cl_uint i = 0; i < numOfKernels; i++) { clReleaseKernel(kernels[i]); }
        for(int i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); }
        clReleaseProgram(program);
    }// end of device loop and execution
    
	    clReleaseMemObject(inobj);
        clReleaseContext(context);
   }// end of platform loop

   free(ud_in);
   free(ud_out);
}
Example #19
0
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX, bufY;
    cl_event event = NULL;
    int ret = 0;
	int lenX = 1 + (N-1)*abs(incx);
	int lenY = 1 + (N-1)*abs(incy);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err);
    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);

	printResult();

    /* Call clblas function. */
    err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event);
//	printf("here\n");
    if (err != CL_SUCCESS) {
        printf("clblasSrot() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)),
                                    Y, 0, NULL, NULL);
        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)),
                                    X, 0, NULL, NULL);

        /* At this point you will get the result of SROT placed in vector Y. */
        printResult();
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufY);
    clReleaseMemObject(bufX);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Example #20
0
void initopencl(void) {
	
	int i;


	// Get Platform and Device Info
	CL_CHECK(clGetPlatformIDs(1, &platform_id, &num_platforms));
	
	// Currently this program only runs on a SINGLE GPU.
	CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices));
	printf("=== %d OpenCL platform(s) found: ===\n", num_platforms);
	printf("=== %d OpenCL device(s) found on platform:\n", num_devices);
	
	
	char buffer[10240];
	cl_uint buf_uint;
	cl_ulong buf_ulong;
	printf("  -- %d --\n", i);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), buffer, NULL));
	printf("  DEVICE_NAME = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL));
	printf("  DEVICE_VENDOR = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL));
	printf("  DEVICE_VERSION = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL));
	printf("  DRIVER_VERSION = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL));
	printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL));
	printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL));
	printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);

	if (num_devices == 0)
	{	
		fprintf(stderr, "No Devices found that can run OpenCL.");
		exit(0);	
	}
	// Create OpenCL context
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
	if (ret != CL_SUCCESS) {
		
		fprintf(stderr, "Error creating context: Function returned %d \n\n", ret);
		exit(1);
	
	}
	// Create Command Queue
	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
	if (ret != CL_SUCCESS) {
		
		fprintf(stderr, "Error creating command Queue: Function returned %d \n\n", ret);
		exit(1);
	
	}
	
	// Load the kernel source code into the array source_str
	FILE *fp;
	char *source_str;
	size_t source_size;
	
	fp = fopen("integrate.cl", "r");
	if (!fp) {
	    fprintf(stderr, "Failed to load kernel.\n");
	    exit(1);
	}
	
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose( fp );	
	
	
	// Create a program from the kernel source
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);	
    if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a program for integration3D. %d \n\n", (int)ret);
			exit(1);
	}
    // Build the program
    
    ret = clBuildProgram(program, 1, &device_id, "-DUSE_DOUBLE=1", NULL, NULL); 
    if (ret != CL_SUCCESS)
    {
    
    	size_t length;
    	char buffer[10240];
    	clGetProgramBuildInfo(program, 1, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length);
    	fprintf(stderr, "Error returned %d. \n\n", (int)ret);
    	printf("Error Log: \n\n %s \n\n", buffer);
    	exit(0);
    }
	
/*    // Create the OpenCL kernel (compute_points_Unstructure3D_1)
    kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n");
			exit(1);
	}
*/	
	// Create the OpenCL kernel (check_int)
    kernel2 = clCreateKernel(program, "check_int", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for check_int. %d \n\n", (int)ret);
			exit(1);
	}
	
    
    // Create the OpenCL kernel (compute_points_Unstructure3D_1)
    kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n");
			exit(1);
	}
	
	// Create the OpenCL kernel (initialize_timestep3D)
	kernel3 = clCreateKernel(program, "initialize_timestep3D", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for initialize_timestep3D. \n\n");
			exit(1);
	}
	
	// Create the OpenCL kernel (initialize_timestep3D)
    kernel4 = clCreateKernel(program, "LocalSearch3D", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n");
			exit(1);
	}
	
	// Create the OpenCL kernel (initialize_timestep3D)
    kernel5 = clCreateKernel(program, "compute_points_Unstructure3D_2", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n");
			exit(1);
	}
	
	
	
	printf("\n\n");
}
void WorkScheduler::initialize(bool use_opencl, int num_cpu_threads)
{
	/* initialize highlighting */
	if (!g_highlightInitialized) {
		if (g_highlightedNodesRead) MEM_freeN(g_highlightedNodesRead);
		if (g_highlightedNodes)     MEM_freeN(g_highlightedNodes);

		g_highlightedNodesRead = NULL;
		g_highlightedNodes = NULL;

		COM_startReadHighlights();

		g_highlightInitialized = true;
	}

#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
	/* deinitialize if number of threads doesn't match */
	if (g_cpudevices.size() != num_cpu_threads) {
		Device *device;

		while (g_cpudevices.size() > 0) {
			device = g_cpudevices.back();
			g_cpudevices.pop_back();
			device->deinitialize();
			delete device;
		}

		g_cpuInitialized = false;
	}

	/* initialize CPU threads */
	if (!g_cpuInitialized) {
		for (int index = 0; index < num_cpu_threads; index++) {
			CPUDevice *device = new CPUDevice();
			device->initialize();
			g_cpudevices.push_back(device);
		}

		g_cpuInitialized = true;
	}

#ifdef COM_OPENCL_ENABLED
	/* deinitialize OpenCL GPU's */
	if (use_opencl && !g_openclInitialized) {
		g_context = NULL;
		g_program = NULL;

		if (clewInit() != CLEW_SUCCESS) /* this will check for errors and skip if already initialized */
			return;

		if (clCreateContextFromType) {
			cl_uint numberOfPlatforms = 0;
			cl_int error;
			error = clGetPlatformIDs(0, 0, &numberOfPlatforms);
			if (error == -1001) { }   /* GPU not supported */
			else if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
			if (G.f & G_DEBUG) printf("%u number of platforms\n", numberOfPlatforms);
			cl_platform_id *platforms = (cl_platform_id *)MEM_mallocN(sizeof(cl_platform_id) * numberOfPlatforms, __func__);
			error = clGetPlatformIDs(numberOfPlatforms, platforms, 0);
			unsigned int indexPlatform;
			for (indexPlatform = 0; indexPlatform < numberOfPlatforms; indexPlatform++) {
				cl_platform_id platform = platforms[indexPlatform];
				cl_uint numberOfDevices = 0;
				clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 0, &numberOfDevices);
				if (numberOfDevices <= 0)
					continue;

				cl_device_id *cldevices = (cl_device_id *)MEM_mallocN(sizeof(cl_device_id) * numberOfDevices, __func__);
				clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numberOfDevices, cldevices, 0);

				g_context = clCreateContext(NULL, numberOfDevices, cldevices, clContextError, NULL, &error);
				if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
				const char *cl_str[2] = {datatoc_COM_OpenCLKernels_cl, NULL};
				g_program = clCreateProgramWithSource(g_context, 1, cl_str, 0, &error);
				error = clBuildProgram(g_program, numberOfDevices, cldevices, 0, 0, 0);
				if (error != CL_SUCCESS) {
					cl_int error2;
					size_t ret_val_size = 0;
					printf("CLERROR[%d]: %s\n", error, clewErrorString(error));
					error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
					if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
					char *build_log = (char *)MEM_mallocN(sizeof(char) * ret_val_size + 1, __func__);
					error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
					if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
					build_log[ret_val_size] = '\0';
					printf("%s", build_log);
					MEM_freeN(build_log);
				}
				else {
					unsigned int indexDevices;
					for (indexDevices = 0; indexDevices < numberOfDevices; indexDevices++) {
						cl_device_id device = cldevices[indexDevices];
						cl_int vendorID = 0;
						cl_int error2 = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, NULL);
						if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error2, clewErrorString(error2)); }
						OpenCLDevice *clDevice = new OpenCLDevice(g_context, device, g_program, vendorID);
						clDevice->initialize();
						g_gpudevices.push_back(clDevice);
					}
				}
				MEM_freeN(cldevices);
			}
			MEM_freeN(platforms);
		}

		g_openclInitialized = true;
	}
#endif
#endif
}
int main()
{
	srand(unsigned(time(nullptr)));
	int err;                            // error code returned from api calls

	cl_device_id device_id;             // compute device id 
	cl_context context;                 // compute context
	cl_command_queue commands;          // compute command queue
	cl_program program;                 // compute program
	cl_kernel kernel;                   // compute kernel

										// OpenCL device memory for matrices
	cl_mem d_A;
	cl_mem d_B;
	cl_mem d_C;

	// set seed for rand()
	srand(2014);

	//Allocate host memory for matrices A and B
	unsigned int size_A = WA * HA;
	unsigned int mem_size_A = sizeof(float) * size_A;
	float* h_A = (float*)malloc(mem_size_A);

	unsigned int size_B = WB * HB;
	unsigned int mem_size_B = sizeof(float) * size_B;
	float* h_B = (float*)malloc(mem_size_B);

	//Initialize host memory
	randomMemInit(h_A, size_A);
	randomMemInit(h_B, size_B);

	//Allocate host memory for the result C
	unsigned int size_C = WC * HC;
	unsigned int mem_size_C = sizeof(float) * size_C;
	float* h_C = (float*)malloc(mem_size_C);

	printf("Initializing OpenCL device...\n");

	cl_uint dev_cnt = 0;
	clGetPlatformIDs(0, 0, &dev_cnt);

	cl_platform_id platform_ids[100];
	clGetPlatformIDs(dev_cnt, platform_ids, NULL);

	// Connect to a compute device
	int gpu = 1;
	err = clGetDeviceIDs(platform_ids[0], gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
	if (err != CL_SUCCESS){
		printf("Error: Failed to create a device group!\n");
		return EXIT_FAILURE;
	}

	// Create a compute context 
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if (!context){
		printf("Error: Failed to create a compute context!\n");
		return EXIT_FAILURE;
	}

	// Create a command commands
	commands = clCreateCommandQueue(context, device_id, 0, &err);
	if (!commands){
		printf("Error: Failed to create a command commands!\n");
		return EXIT_FAILURE;
	}

	// Create the compute program from the source file
	char *KernelSource;
	long lFileSize = LoadOpenCLKernel("matrixmul_kernel.cl", &KernelSource);
	if (lFileSize < 0L){
		perror("File read failed");
		return 1;
	}
	//const char* KernelSource = loadKernelCPP(".\\matrixmul_kernel.cl");

	program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err);
	if (!program){
		printf("Error: Failed to create compute program!\n");
		return EXIT_FAILURE;
	}

	// Build the program executable
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS){
		size_t len;
		char buffer[2048];
		printf("Error: Failed to build program executable!\n");
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);
		exit(1);
	}

	// Create the compute kernel in the program we wish to run
	kernel = clCreateKernel(program, "matrixMul", &err);
	if (!kernel || err != CL_SUCCESS){
		printf("Error: Failed to create compute kernel!\n");
		exit(1);
	}

	// Create the input and output arrays in device memory for our calculation
	d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, NULL, &err);
	d_A = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &err);
	d_B = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &err);

	if (!d_A || !d_B || !d_C){
		printf("Error: Failed to allocate device memory!\n");
		exit(1);
	}

	printf("Running matrix multiplication for matrices A (%dx%d) and B (%dx%d) ...\n", WA, HA, WB, HB);

	//Launch OpenCL kernel
	size_t localWorkSize[2], globalWorkSize[2];

	int wA = WA;
	int wC = WC;
	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_C);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_A);
	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_B);
	err |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&wA);
	err |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&wC);

	if (err != CL_SUCCESS){
		printf("Error: Failed to set kernel arguments! %d\n", err);
		exit(1);
	}

	localWorkSize[0] = 16;
	localWorkSize[1] = 16;
	globalWorkSize[0] = 1024;
	globalWorkSize[1] = 1024;

	err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);

	if (err != CL_SUCCESS){
		printf("Error: Failed to execute kernel! %d\n", err);
		exit(1);
	}

	//Retrieve result from device
	err = clEnqueueReadBuffer(commands, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL);

	if (err != CL_SUCCESS){
		printf("Error: Failed to read output array! %d\n", err);
		exit(1);
	}
	//print table A

	printf("\nMatrix A\n");
	for (int i = 0; i < size_A; i++){
		printf("%f\t", h_A[i]);
		if (((i + 1) % WA) == 0)
			printf("\n");
	}

	//print table B

	printf("\nMatrix B\n");
	for (int i = 0; i < size_B; i++){
		printf("%f\t", h_B[i]);
		if (((i + 1) % WB) == 0)
			printf("\n");
	}

	//print out the results

	printf("\nMatrix C (Results)\n");
	for (int i = 0; i < size_C; i++){
		printf("%f\t", h_C[i]);
		if (((i + 1) % WC) == 0)
			printf("\n");
	}
	printf("\n");


	printf("Matrix multiplication completed...\n");

	//Shutdown and cleanup
	free(h_A);
	free(h_B);
	free(h_C);

	clReleaseMemObject(d_A);
	clReleaseMemObject(d_C);
	clReleaseMemObject(d_B);

	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	std::cin.clear();
	std::cin.sync();
	std::cin.get();
}
Example #23
0
int main(void) {
//time meassuring
  	struct timeval tvs;

//variables
	int 	Nx=1024;
	int		Ny=1024;
	int 	plotnum=0;
	int	  	Tmax=2;
	int 	plottime=0;
	int	  	plotgap=1;
	double	Lx=1.0;
	double 	Ly=1.0;
	double	dt=0.0;	
	double	A=0.0;
	double	B=0.0;
	double	Du=0.0;
	double	Dv=0.0;
//splitting coefficients
	double	a=0.5;	
	double 	b=0.5;
	double 	c=1.0;
//loop counters	
	int i=0;
	int j=0;
	int n=0;

	double*umax=NULL;
	double*vmax=NULL;
	parainit(&Nx,&Ny,&Tmax,&plotgap,&Lx,&Ly,&dt,&Du,&Dv,&A,&B);
	plottime=plotgap;
	vmax=(double*)malloc((Tmax/plotgap+1)*sizeof(double));
	umax=(double*)malloc((Tmax/plotgap+1)*sizeof(double));
//openCL variables
    cl_platform_id *platform_id = NULL;
    cl_kernel frequencies = NULL, initialdata = NULL, linearpart=NULL;
	cl_kernel nonlinearpart_a=NULL, nonlinearpart_b=NULL;
    cl_int ret;
    cl_uint num_platforms;
// Detect how many platforms there are.
	ret = clGetPlatformIDs(0, NULL, &num_platforms);
// Allocate enough space for the number of platforms.
	platform_id = (cl_platform_id*) malloc(num_platforms*sizeof(cl_platform_id));
// Store the platforms
	ret = clGetPlatformIDs(num_platforms, platform_id, NULL);
	printf("Found %d platform(s)!\n",num_platforms);
    cl_uint *num_devices;
	num_devices=(cl_uint*) malloc(num_platforms*sizeof(cl_uint));
    cl_device_id **device_id = NULL;
	device_id =(cl_device_id**) malloc(num_platforms*sizeof(cl_device_id*));
// Detect number of devices in the platforms
	for(i=0;i<num_platforms;i++){
		char buf[65536];
		size_t size;
		ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_VERSION,sizeof(buf),buf,&size);
		printf("%s\n",buf);
		ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,0,NULL,num_devices);
		printf("Found %d device(s) on platform %d!\n", num_devices[i],i);
		ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_NAME,sizeof(buf),buf,&size);
		printf("%s ",buf);
// Store numDevices from platform
		device_id[i]=(cl_device_id*) malloc(num_devices[i]*sizeof(device_id));
		ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,num_devices[i],device_id[i],NULL);
		for(j=0;j<num_devices[i];j++){
			ret = clGetDeviceInfo(device_id[i][j],CL_DEVICE_NAME,sizeof(buf),buf,&size);
			printf("%s (%d,%d)\n",buf,i,j);
		}
	}
//create context and command_queue
    cl_context context = NULL;
   	cl_command_queue command_queue = NULL;
//Which platform and device do i choose?
	int	chooseplatform=0;
	int	choosedevice=0;	  
	printf("Choose platform %d and device %d!\n",chooseplatform,choosedevice);
	context = clCreateContext( NULL, num_devices[chooseplatform], device_id[chooseplatform], NULL, NULL, &ret);
	if(ret!=CL_SUCCESS){printf("createContext ret:%d\n",ret); exit(1); }
	command_queue = clCreateCommandQueue(context, device_id[chooseplatform][choosedevice], 0, &ret);
	if(ret!=CL_SUCCESS){printf("createCommandQueue ret:%d\n",ret); exit(1); }

//OpenCL arrays
    cl_mem cl_u = NULL,cl_v = NULL;
   	cl_mem cl_uhat = NULL, cl_vhat = NULL;
    cl_mem cl_kx = NULL, cl_ky = NULL;

//FFT
	clfftPlanHandle planHandle;
    cl_mem tmpBuffer = NULL;
	fftinit(&planHandle,&context, &command_queue, &tmpBuffer, Nx, Ny);

//allocate gpu memory/
	cl_u=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret);
	cl_v=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret);
	cl_uhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret);
	cl_vhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret);
	cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(double), NULL, &ret);
	cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(double), NULL, &ret);

	printf("allocated space\n");
//load the kernels
	loadKernel(&frequencies,&context,&device_id[chooseplatform][choosedevice],"frequencies");
	loadKernel(&initialdata,&context,&device_id[chooseplatform][choosedevice],"initialdata"); 
	loadKernel(&linearpart,&context,&device_id[chooseplatform][choosedevice],"linearpart"); 
	loadKernel(&nonlinearpart_a,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_a"); 
	loadKernel(&nonlinearpart_b,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_b"); 

	size_t global_work_size[1] = {Nx*Ny};
	size_t global_work_size_X[1] = {Nx};
	size_t global_work_size_Y[1] = {Ny};
//frequencies
    ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_kx);
	ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Lx);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Nx);
    ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_X, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
    ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_ky);
	ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Ly);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Ny);
    ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_Y, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//printCL(&cl_kx,&command_queue,Nx,1);
//printCL(&cl_ky,&command_queue,1,Ny);
//inintial data
    ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(initialdata, 2, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(initialdata, 3, sizeof(int),(void* )&Ny);
	ret = clSetKernelArg(initialdata, 4, sizeof(double),(void* )&Lx);
	ret = clSetKernelArg(initialdata, 5, sizeof(double),(void* )&Ly);
    ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//make output
    writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
    umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
	printf("Got initial data, starting timestepping\n");
	mtime_s(&tvs);

	for(n=0;n<=Tmax;n++){
//nonlinearpart_a
    ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);	

//nonlinearpart_b
    ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

//linear
	fft2dfor(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer);
	fft2dfor(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer);
//printf("A%f,B%f\n",A,B);
    ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat);
    ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_vhat);
	ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void* )&cl_kx);
	ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void* )&cl_ky);
	ret = clSetKernelArg(linearpart, 4, sizeof(double),(void* )&Du);
	ret = clSetKernelArg(linearpart, 5, sizeof(double),(void* )&Dv);
	ret = clSetKernelArg(linearpart, 6, sizeof(double),(void* )&A);
	ret = clSetKernelArg(linearpart, 7, sizeof(double),(void* )&B);
	ret = clSetKernelArg(linearpart, 8, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(linearpart, 9, sizeof(double),(void* )&c);
	ret = clSetKernelArg(linearpart, 10, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(linearpart, 11, sizeof(int),(void* )&Ny);
    ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

	fft2dback(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer);
  	fft2dback(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer);

//nonlinearpart_b
    ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);		
//nonlinearpart_a
    ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);	
// done
	if(n==plottime){
		printf("time:%f, step:%d,%d,umax:%f,vmax:%f\n",n*dt,n,plotnum,umax[plotnum],vmax[plotnum]);
		plottime=plottime+plotgap;
		plotnum=plotnum+1;
   	 	writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    	writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
        umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
        vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
	}
}//end timestepping

	printf("Finished time stepping\n");
	mtime_e(&tvs,"Programm took:");
	writearray(umax,(Tmax/plotgap)+1,"u");
	writearray(vmax,(Tmax/plotgap)+1,"v");
	free(umax);
	free(vmax);	

	clReleaseMemObject(cl_u);
	clReleaseMemObject(cl_v);
	clReleaseMemObject(cl_uhat);
	clReleaseMemObject(cl_vhat);
	clReleaseMemObject(cl_kx);
	clReleaseMemObject(cl_ky);

    ret = clReleaseKernel(initialdata); 
    ret = clReleaseKernel(frequencies); 
    ret = clReleaseKernel(linearpart); 
    ret = clReleaseKernel(nonlinearpart_a);
    ret = clReleaseKernel(nonlinearpart_b);

	fftdestroy(&planHandle, &tmpBuffer);

	ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

	for(i=0;i<num_platforms;i++){free(device_id[i]);}
	free(device_id);
	free(platform_id);
	free(num_devices);
	printf("Program execution complete\n");

	return 0;
}
Example #24
0
int main() {
	char buf[]="Hello, World!";
	size_t srcsize, worksize=strlen(buf);
	
	cl_int error;
	cl_platform_id platform;
	cl_device_id device;
	cl_uint platforms, devices;

	// Fetch the Platform and Device IDs; we only want one.
	error=clGetPlatformIDs(1, &platform, &platforms);
	error=clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &devices);
	cl_context_properties properties[]={
		CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
		0};
	// Note that nVidia's OpenCL requires the platform property
	cl_context context=clCreateContext(properties, 1, &device, NULL, NULL, &error);
	cl_command_queue cq = clCreateCommandQueue(context, device, 0, &error);
	
	rot13(buf);	// scramble using the CPU
	puts(buf);	// Just to demonstrate the plaintext is destroyed

	//char src[8192];
	//FILE *fil=fopen("rot13.cl","r");
	//srcsize=fread(src, sizeof src, 1, fil);
	//fclose(fil);
	
	const char *src=rot13_cl;
	srcsize=strlen(rot13_cl);

	const char *srcptr[]={src};
	// Submit the source code of the rot13 kernel to OpenCL
	cl_program prog=clCreateProgramWithSource(context,
		1, srcptr, &srcsize, &error);
	// and compile it (after this we could extract the compiled version)
	error=clBuildProgram(prog, 0, NULL, "", NULL, NULL);

	// Allocate memory for the kernel to work with
	cl_mem mem1, mem2;
	mem1=clCreateBuffer(context, CL_MEM_READ_ONLY, worksize, NULL, &error);
	mem2=clCreateBuffer(context, CL_MEM_WRITE_ONLY, worksize, NULL, &error);
	
	// get a handle and map parameters for the kernel
	cl_kernel k_rot13=clCreateKernel(prog, "rot13", &error);
	clSetKernelArg(k_rot13, 0, sizeof(mem1), &mem1);
	clSetKernelArg(k_rot13, 1, sizeof(mem2), &mem2);

	// Target buffer just so we show we got the data from OpenCL
	char buf2[sizeof buf];
	buf2[0]='?';
	buf2[worksize]=0;

	// Send input data to OpenCL (async, don't alter the buffer!)
	error=clEnqueueWriteBuffer(cq, mem1, CL_FALSE, 0, worksize, buf, 0, NULL, NULL);
	// Perform the operation
	error=clEnqueueNDRangeKernel(cq, k_rot13, 1, NULL, &worksize, &worksize, 0, NULL, NULL);
	// Read the result back into buf2
	error=clEnqueueReadBuffer(cq, mem2, CL_FALSE, 0, worksize, buf2, 0, NULL, NULL);
	// Await completion of all the above
	error=clFinish(cq);
	
	// Finally, output out happy message.
	puts(buf2);
}
Example #25
0
int main( int argc, char* argv[] )
{
    // Length of vectors
    unsigned int n = 100000;
 
    // Host input vectors
    double *h_a;
    double *h_b;
    // Host output vector
    double *h_c;
 
    // Device input buffers
    cl_mem d_a;
    cl_mem d_b;
    // Device output buffer
    cl_mem d_c;
 
    cl_platform_id cpPlatform;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    cl_command_queue queue;           // command queue
    cl_program program;               // program
    cl_kernel kernel;                 // kernel
 
    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);
 
    // Allocate memory for each vector on host
    h_a = (double*)malloc(bytes);
    h_b = (double*)malloc(bytes);
    h_c = (double*)malloc(bytes);
 
    // Initialize vectors on host
    int i;
    for( i = 0; i < n; i++ )
    {
        h_a[i] = sinf(i)*sinf(i);
        h_b[i] = cosf(i)*cosf(i);
    }
 
    size_t globalSize, localSize;
    cl_int err;
 
    // Number of work items in each local work group
    localSize = 64;
 
    // Number of total work items - localSize must be devisor
    globalSize = ceil(n/(float)localSize)*localSize;
 
    // Bind to platform
    err = clGetPlatformIDs(1, &cpPlatform, NULL);
 
    // Get ID for the device
    err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
 
    // Create a context 
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
 
    // Create a command queue
    queue = clCreateCommandQueue(context, device_id, 0, &err);
 
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1,
                            (const char **) & kernelSource, NULL, &err);
 
    // Build the program executable
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
 
    // Create the compute kernel in the program we wish to run
    kernel = clCreateKernel(program, "vecAdd", &err);
 
    // Create the input and output arrays in device memory for our calculation
    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
 
    // Write our data set into the input array in device memory
    err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
                                   bytes, h_a, 0, NULL, NULL);
    err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
                                   bytes, h_b, 0, NULL, NULL);
 
    // Set the arguments to our compute kernel
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
 
    // Execute the kernel over the entire range of the data set 
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                                                              0, NULL, NULL);
 
    // Wait for the command queue to get serviced before reading back results
    clFinish(queue);
 
    // Read the results from the device
    clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
                                bytes, h_c, 0, NULL, NULL );
 
    //Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);
 
    // release OpenCL resources
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
 
    //release host memory
    free(h_a);
    free(h_b);
    free(h_c);
 
    return 0;
}
Example #26
0
/**
 * @brief Creates an array of objects containing the OpenCL variables of each device
 * @param trDataBase The training database which will contain the instances and the features
 * @param selInstances The instances choosen as initial centroids
 * @param transposedTrDataBase The training database already transposed
 * @param conf The structure with all configuration parameters
 * @return A pointer containing the objects
 */
CLDevice *createDevices(const float *const trDataBase, const int *const selInstances, const float *const transposedTrDataBase, Config *const conf) {


	/********** Find the OpenCL devices specified in configuration ***********/

	// OpenCL variables
	cl_uint numPlatformsDevices;
	cl_device_type deviceType;
	cl_program program;
	cl_kernel kernel;
	cl_int status;

	// Others variables
	auto allDevices = getAllDevices();
	CLDevice *devices = new CLDevice[conf -> nDevices + (conf -> ompThreads > 0)];

	for (int dev = 0; dev < conf -> nDevices; ++dev) {

		bool found = false;
		for (int allDev = 0; allDev < allDevices.size() && !found; ++allDev) {

			// Get the specified OpenCL device
			char dbuff[120];
			check(clGetDeviceInfo(allDevices[allDev], CL_DEVICE_NAME, sizeof(dbuff), dbuff, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_NAME);

			// If the device exists...
			if (conf -> devices[dev] == dbuff) {
				devices[dev].device = allDevices[allDev];
				devices[dev].deviceName = dbuff;
				check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_TYPE, sizeof(cl_device_type), &(devices[dev].deviceType), NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_TYPE);


				/********** Device local memory usage ***********/

				long int usedMemory = conf -> nFeatures * sizeof(cl_uchar); // Chromosome of the individual
				usedMemory += conf -> trNInstances * sizeof(cl_uchar); // Mapping buffer
				usedMemory += conf -> K * conf -> nFeatures * sizeof(cl_float); // Centroids buffer
				usedMemory += conf -> trNInstances * sizeof(cl_float); // DistCentroids buffer
				usedMemory += conf -> K * sizeof(cl_int); // Samples_in_k buffer

				// Get the maximum local memory size
				long int maxMemory;
				check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(long int), &maxMemory, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_MAXMEM);

				// Avoid exceeding the maximum local memory available. 1024 bytes of margin
				check(usedMemory > maxMemory - 1024, "%s:\n\tMax memory: %ld bytes\n\tAllow memory: %ld bytes\n\tUsed memory: %ld bytes\n", CL_ERROR_DEVICE_LOCALMEM, maxMemory, maxMemory - 1024, usedMemory);


				/********** Create context ***********/

				devices[dev].context = clCreateContext(NULL, 1, &(devices[dev].device), 0, 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_CONTEXT);


				/********** Create Command queue ***********/

				devices[dev].commandQueue = clCreateCommandQueue(devices[dev].context, devices[dev].device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_QUEUE);


				/********** Create kernel ***********/

				// Open the file containing the kernels
				std::fstream kernels(conf -> kernelsFileName.c_str(), std::fstream::in);
				check(!kernels.is_open(), "%s\n", CL_ERROR_FILE_OPEN);

				// Obtain the size
				kernels.seekg(0, kernels.end);
				size_t fSize = kernels.tellg();
				kernels.seekg(0, kernels.beg);

				char *kernelSource = new char[fSize];
				kernels.read(kernelSource, fSize);
				kernels.close();

				// Create program
				program = clCreateProgramWithSource(devices[dev].context, 1, (const char **) &kernelSource, &fSize, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_BUILD);

				// Build program for the device in the context
				char buildOptions[196];
				sprintf(buildOptions, "-I include -D N_INSTANCES=%d -D N_FEATURES=%d -D N_OBJECTIVES=%d -D K=%d -D MAX_ITER_KMEANS=%d", conf -> trNInstances, conf -> nFeatures, conf -> nObjectives, conf -> K, conf -> maxIterKmeans);
				if (clBuildProgram(program, 1, &(devices[dev].device), buildOptions, 0, 0) != CL_SUCCESS) {
					char buffer[4096];
					fprintf(stderr, "Error: Could not build the program\n");
					check(clGetProgramBuildInfo(program, devices[dev].device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_ERRORS);
					check(true, "%s\n", buffer);
				}

				// Create kernel
				const char *kernelName = (devices[dev].deviceType == CL_DEVICE_TYPE_GPU) ? "kmeansGPU" : "";
				devices[dev].kernel = clCreateKernel(program, kernelName, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_BUILD);


				/******* Work-items *******/

				devices[dev].computeUnits = atoi(conf -> computeUnits[dev].c_str());
				devices[dev].wiLocal = atoi(conf -> wiLocal[dev].c_str());
				devices[dev].wiGlobal = devices[dev].computeUnits * devices[dev].wiLocal;


				/******* Create and write the databases and centroids buffers. Create the subpopulations buffer. Set kernel arguments *******/

				// Create buffers
				devices[dev].objSubpopulations = clCreateBuffer(devices[dev].context, CL_MEM_READ_WRITE, conf -> familySize * sizeof(Individual), 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_SUBPOPS);

				devices[dev].objTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TRDB);

				devices[dev].objTransposedTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TTRDB);

				devices[dev].objSelInstances = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> K * sizeof(cl_int), 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_CENTROIDS);

				// Sets kernel arguments
				check(clSetKernelArg(devices[dev].kernel, 0, sizeof(cl_mem), (void *)&(devices[dev].objSubpopulations)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT1);

				check(clSetKernelArg(devices[dev].kernel, 1, sizeof(cl_mem), (void *)&(devices[dev].objSelInstances)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT2);

				check(clSetKernelArg(devices[dev].kernel, 2, sizeof(cl_mem), (void *)&(devices[dev].objTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT3);

				check(clSetKernelArg(devices[dev].kernel, 5, sizeof(cl_mem), (void *)&(devices[dev].objTransposedTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT6);

				// Write buffers
				check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), trDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TRDB);
				check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objSelInstances, CL_FALSE, 0, conf -> K * sizeof(cl_int), selInstances, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_CENTROIDS);
				check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTransposedTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), transposedTrDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TTRDB);

				// Resources used are released
				delete[] kernelSource;
				clReleaseProgram(program);

				found = true;
				allDevices.erase(allDevices.begin() + allDev);
			}
		}

		check(!found, "%s\n", CL_ERROR_DEVICE_FOUND);
	}


	/********** Add the CPU if has been enabled in configuration ***********/

	if (conf -> ompThreads > 0) {
		devices[conf -> nDevices].deviceType = CL_DEVICE_TYPE_CPU;
		devices[conf -> nDevices].computeUnits = conf -> ompThreads;
		++(conf -> nDevices);
	}

	return devices;
}
Example #27
0
/** 
 * @brief Create a new OpenCL zone, which will contain complete information for an OpenCL execution session on a specific device. 
 * 
 * @param deviceType Device type (OpenCL bitfield).
 * @param numQueues Number of command queues.
 * @param queueProperties Properties for the command queues.
 * @param devSel Pointer to function which will select device, if more than one is available.
 * @param dsExtraArg Extra argument for (*deviceSelector) function.
 * @param err Error structure, to be populated if an error occurs.
 * @return OpenCL zone or NULL if device wasn't properly initialized.
 */
CLUZone* clu_zone_new(cl_uint deviceType, cl_uint numQueues, cl_int queueProperties, clu_device_selector devSel, void* dsExtraArg, GError **err) {
	
	/* OpenCL status variable. */
	cl_int status;
	
	/* OpenCL zone to initialize and return */
	CLUZone* zone;
	
	/* Information about devices */
	CLUDeviceInfo devInfos[CLU_MAX_DEVICES_TOTAL];

	/* Number of devices. */
	cl_uint numDevices;

	/* Index of device information */
	cl_int deviceInfoIndex;

	/* Context properties, */
	cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, 0, 0};

	/* List of platform Ids. */
	cl_platform_id platfIds[CLU_MAX_PLATFORMS];

	/* Number of platforms. */
	cl_uint numPlatforms;

	/* Total number of devices. */
	unsigned int totalNumDevices;

	/* Device IDs for a given platform. */
	cl_device_id devIds[CLU_MAX_DEVICES_PER_PLATFORM];
	
	/* Initialize zone */
	zone = (CLUZone*) malloc(sizeof(CLUZone));
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		NULL == zone, 
		CLU_ERROR_NOALLOC, 
		error_handler, 
		"Unable to allocate memory for OpenCL zone"
	);
	zone->context = NULL;
	zone->queues = NULL;
	zone->program = NULL;
	zone->device_info.device_id = NULL;
	zone->device_info.platform_id = NULL;
	zone->device_info.device_name[0] = '\0';
	zone->device_info.device_vendor[0] = '\0';
	zone->device_info.platform_name[0] = '\0';
		
	/* Get number of platforms */
	status = clGetPlatformIDs(0, NULL, &numPlatforms);
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		CL_SUCCESS != status, 
		CLU_OCL_ERROR, 
		error_handler, 
		"clu_zone_new: get number of platforms (OpenCL error %d: %s).",
		status,
		clerror_get(status));

	/* Get existing platforms */
	status = clGetPlatformIDs(numPlatforms, platfIds, NULL);
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		CL_SUCCESS != status, 
		CLU_OCL_ERROR, 
		error_handler, 
		"clu_zone_new: get platform Ids (OpenCL error %d: %s).", 
		status,
		clerror_get(status));

	/* Cycle through platforms, get specified devices in existing platforms */
	totalNumDevices = 0;
	for(unsigned int i = 0; i < numPlatforms; i++) 	{
		/* Get specified devices for current platform */
		status = clGetDeviceIDs(
			platfIds[i], 
			deviceType, 
			CLU_MAX_DEVICES_PER_PLATFORM, 
			devIds, 
			&numDevices);
		if (status != CL_DEVICE_NOT_FOUND) {
			/* At least one device found, lets take note */
			gef_if_error_create_goto(
				*err, 
				CLU_UTILS_ERROR, 
				CL_SUCCESS != status, 
				CLU_OCL_ERROR, 
				error_handler, 
				"clu_zone_new: get device Ids (OpenCL error %d: %s).", 
				status,
				clerror_get(status));
			for (unsigned int j = 0; j < numDevices; j++) {
				/* Keep device and platform IDs. */
				devInfos[totalNumDevices].device_id = devIds[j];
				devInfos[totalNumDevices].platform_id = platfIds[i];
				/* Get device name. */
				status = clGetDeviceInfo(
					devIds[j], 
					CL_DEVICE_NAME, 
					sizeof(devInfos[totalNumDevices].device_name), 
					devInfos[totalNumDevices].device_name, 
					NULL);
				gef_if_error_create_goto(
					*err, 
					CLU_UTILS_ERROR, 
					CL_SUCCESS != status, 
					CLU_OCL_ERROR, 
					error_handler, 
					"clu_zone_new: get device name info (OpenCL error %d: %s).",
					status,
					clerror_get(status));
				/* Get device vendor. */
				status = clGetDeviceInfo(
					devIds[j], 
					CL_DEVICE_VENDOR, 
					sizeof(devInfos[totalNumDevices].device_vendor), 
					devInfos[totalNumDevices].device_vendor, 
					NULL);
				gef_if_error_create_goto(
					*err, 
					CLU_UTILS_ERROR, 
					CL_SUCCESS != status, 
					CLU_OCL_ERROR, 
					error_handler, 
					"clu_zone_new: get device vendor info (OpenCL error %d: %s).", 
					status,
					clerror_get(status));
				/* Get platform name. */
				status = clGetPlatformInfo(
					platfIds[i],
					CL_PLATFORM_VENDOR,
					sizeof(devInfos[totalNumDevices].platform_name),
					devInfos[totalNumDevices].platform_name,
					NULL);
				gef_if_error_create_goto(
					*err, 
					CLU_UTILS_ERROR, 
					CL_SUCCESS != status, 
					CLU_OCL_ERROR, 
					error_handler, 
					"clu_zone_new: get platform info (OpenCL error %d: %s).",
					status,
					clerror_get(status));
				/* Increment total number of found devices. */
				totalNumDevices++;
			}
		}
	}
	
	/* Check whether any devices of the specified type were found */
	if (totalNumDevices == 0) {
		/* No devices of the specified type where found, return with error. */
		gef_if_error_create_goto(
			*err, 
			CLU_UTILS_ERROR, 
			1, 
			CLU_ERROR_DEVICE_NOT_FOUND, 
			error_handler, 
			"clu_zone_new: device not found.");
	} else {
		/* Several compatible devices found, choose one with given selector function. */
		deviceInfoIndex = devSel(devInfos, totalNumDevices, dsExtraArg);
		/* Test return value of selector function (if it is out of range, 
		 * there is a programming error). */
		g_assert_cmpint(deviceInfoIndex, >=, -1);
		g_assert_cmpint(deviceInfoIndex, <, totalNumDevices);
		/* If selector function returned -1, then no device is selectable. */
		if (deviceInfoIndex == -1) {
			gef_if_error_create_goto(
				*err, 
				CLU_UTILS_ERROR, 
				1, 
				CLU_ERROR_DEVICE_NOT_FOUND, 
				error_handler, 
				"clu_zone_new: specified device not found.");
		}
	}

	/* Store info about the selected device and platform. */
	zone->device_type = deviceType;
	zone->device_info = devInfos[deviceInfoIndex];

	/* Determine number of compute units for that device */
	status = clGetDeviceInfo(
		zone->device_info.device_id, 
		CL_DEVICE_MAX_COMPUTE_UNITS, 
		sizeof(cl_uint), 
		&zone->cu, 
		NULL);
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		CL_SUCCESS != status, 
		CLU_OCL_ERROR, 
		error_handler, 
		"clu_zone_new: get target device info (OpenCL error %d: %s).", 
		status,
		clerror_get(status));
	
	/* Create a context on that device. */
	cps[1] = (cl_context_properties) devInfos[deviceInfoIndex].platform_id;
	zone->context = clCreateContext(cps, 1, &zone->device_info.device_id, NULL, NULL, &status);
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		CL_SUCCESS != status, 
		CLU_OCL_ERROR, 
		error_handler, 
		"clu_zone_new: creating context (OpenCL error %d: %s).", 
		status,
		clerror_get(status));
	
	/* Create the specified command queues on that device */
	zone->numQueues = numQueues;
	zone->queues = (cl_command_queue*) malloc(numQueues * sizeof(cl_command_queue));
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		NULL == zone->queues, 
		CLU_ERROR_NOALLOC, 
		error_handler, 
		"Unable to allocate memory to keep OpenCL command queues in Zone."
	);
	
	for (unsigned int i = 0; i < numQueues; i++) {
		zone->queues[i] = clCreateCommandQueue(
			zone->context, 
			zone->device_info.device_id, 
			queueProperties, 
			&status);
		gef_if_error_create_goto(
			*err, 
			CLU_UTILS_ERROR, 
			CL_SUCCESS != status, 
			CLU_OCL_ERROR, 
			error_handler, 
			"clu_zone_new: creating command queue (OpenCL error %d: %s).", 
			status,
			clerror_get(status));
	}

	/* If we got here, everything is OK. */
	g_assert (err == NULL || *err == NULL);
	goto finish;
	
error_handler:
	/* If we got here there was an error, verify that it is so. */
	g_assert (err == NULL || *err != NULL);
	/* Free OpenCL zone. */
	if (zone != NULL) {
		clu_zone_free(zone);
		zone = NULL;
	}

finish:	

	/* Return. */
	return zone;

}
Example #28
0
	OpenCLDevice(DeviceInfo& info, Stats &stats, bool background_)
	  : Device(stats)
	{
		background = background_;
		cpPlatform = NULL;
		cxContext = NULL;
		cqCommandQueue = NULL;
		cpProgram = NULL;
		ckPathTraceKernel = NULL;
		ckFilmConvertKernel = NULL;
		null_mem = 0;
		device_initialized = false;

		/* setup platform */
		cl_uint num_platforms;

		ciErr = clGetPlatformIDs(0, NULL, &num_platforms);
		if(opencl_error(ciErr))
			return;

		if(num_platforms == 0) {
			opencl_error("OpenCL: no platforms found.");
			return;
		}

		ciErr = clGetPlatformIDs(1, &cpPlatform, NULL);
		if(opencl_error(ciErr))
			return;

		char name[256];
		clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
		platform_name = name;

		/* get devices */
		vector<cl_device_id> device_ids;
		cl_uint num_devices;

		if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), 0, NULL, &num_devices)))
			return;

		if(info.num > num_devices) {
			if(num_devices == 0)
				opencl_error("OpenCL: no devices found.");
			else
				opencl_error("OpenCL: specified device not found.");
			return;
		}

		device_ids.resize(num_devices);
		
		if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), num_devices, &device_ids[0], NULL)))
			return;

		cdDevice = device_ids[info.num];

		/* create context */
		cxContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr);
		if(opencl_error(ciErr))
			return;

		cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
		if(opencl_error(ciErr))
			return;

		null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
		device_initialized = true;
	}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program source */
        unsigned char *source_code;
        size_t source_length;

        /* Read program from 'clz_uint16.cl' */
        source_code = read_buffer("clz_uint16.cl", &source_length);

        /* Create a program */
        cl_program program;
        program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret);

        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithSource' failed\n");
                exit(1);
        }
        printf("program=%p\n", program);

        /* Build program */
        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object */
        cl_kernel kernel;
        kernel = clCreateKernel(program, "clz_uint16", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_uint16 *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_uint16));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_uint16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_uint16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_uint16), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_uint16 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_uint16));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_uint16));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_uint16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_uint16), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_uint16));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
Example #30
0
int32_t init_kernel_platform() {

	cl_uint plat_num;
	cl_platform_id plat_id = NULL;
	cl_uint dev_num = 0;
	cl_device_id *devices;

	ret = clGetPlatformIDs(0, NULL, &plat_num);
	if (ret < 0) {
		LOGD("MU1 Error: Getting plat_ids!\n");
		return -1;
	}

	if(plat_num > 0)
	{
		cl_platform_id* plat_ids = (cl_platform_id* )malloc(plat_num* sizeof(cl_platform_id));
		ret = clGetPlatformIDs(plat_num, plat_ids, NULL);
		plat_id = plat_ids[0];
		free(plat_ids);
	}

	ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, 0, NULL, &dev_num);
	if (dev_num == 0) {
		LOGD("MU1: No GPU device available.\n");
		LOGD("MU1: Choose CPU as default device.\n");
		ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, 0, NULL, &dev_num);
		devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id));
		ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, dev_num, devices, NULL);
	} else {
		LOGD("MU1: Choose GPU as default device. dev_num %d\n", dev_num);
		devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id));
		ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, dev_num, devices, NULL);
	}
	
	context = clCreateContext(NULL,1, devices,NULL,NULL,NULL);

	commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
    
	char filename[] = "/data/mu1_kernel.cl";
	char file_context[10*1024]={0};
	const char *source = &file_context[0];

    ret = read_cl(filename, &file_context[0]);

	size_t sourceSize[10] = {strlen(source)};
	cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize[0], NULL);
	
	ret = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
    if(ret < 0) {
        LOGD("MU1 Error: clBuildProgram error\n");
        return 0;
    }

	kernel = clCreateKernel(program, "process_iq", NULL);

	inputBuffer_i = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
            512*1024*4, (void *)(&table_i[0][0]), NULL);
	inputBuffer_q = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
            512*1024*4, (void *)(&table_q[0][0]), NULL);
	inputBuffer_o = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, 
            512*1024*4, (void *)(&table_o[0][0]), NULL);


	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer_i);
	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputBuffer_q);
	ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&inputBuffer_o);



    if(devices != NULL) { free(devices);}

    LOGD("MU1: init cl plat success");
    return 0;
}