Example #1
2
int main(int argc, char** argv)
{
  int err;                            // error code returned from api calls
  cl_platform_id platform_id;         // platform id
  cl_device_id device_id;             // compute device id 
  cl_context context;                 // compute context
  cl_command_queue commands;          // compute command queue
  cl_program program;                 // compute program
  cl_kernel kernel;                   // compute kernel

  size_t global[2];                   // global domain size for our calculation
  size_t local[2];                    // local domain size for our calculation

  char cl_platform_vendor[1001];
  char cl_platform_name[1001];
   

  cl_mem in_array;                     // device memory used for the input array
  //cl_mem synaptic_weights;             // device memory used for the input array
  cl_mem out_array;                    // device memory used for the output array
   
  if (argc != 2){
    printf("%s <inputfile>\n", argv[0]);
    return -1;
  }

	//float in_array[NO_NODES];
	//float out_array[NO_NODES];
	//float synaptic_weights[NO_NODES*NO_NODES];
	float in_array_tb[NO_NODES];
	float out_array_tb[NO_NODES];
	//float synaptic_weights_tb[NO_NODES*NO_NODES];
	float temp =0;
	int i = 0;
    	int j = 0;
	int index = 0;
	FILE* ifp;
	char* mode = "r";
  //
  // Connect to first platform
  //
  err = clGetPlatformIDs(1,&platform_id,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to find an OpenCL platform!\n");
    printf("Test failed\n");
    return -1;
  }
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
    printf("Test failed\n");
    return -1;
  }
  printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor);
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n");
    printf("Test failed\n");
    return -1;
  }
  printf("CL_PLATFORM_NAME %s\n",cl_platform_name);
 
  // Connect to a compute device
  //
  int fpga = 0;
#if defined (FPGA_DEVICE)
  fpga = 1;
#endif
  err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU,
                       1, &device_id, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to create a device group!\n");
    printf("Test failed\n");
    return -1;
  }
  
  //
  // Create a compute context 
  //
  context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  if (!context)
  {
    printf("Error: Failed to create a compute context!\n");
    printf("Test failed\n");
    return -1;
  }

  //relu_1(in_array,synaptic_weights,out_array);


  // Fill our data sets with pattern
  //
  //int i = 0;
  //for(i = 0; i < DATA_SIZE; i++) {
  //  a[i] = (int)i;
  //  b[i] = (int)i;
  //  results[i] = 0;
  //}
  //
  
  
  // Create a command commands
  commands = clCreateCommandQueue(context, device_id, 0, &err);
  if (!commands)
  {
    printf("Error: Failed to create a command commands!\n");
    printf("Error: code %i\n",err);
    printf("Test failed\n");
    return -1;
  }

  int status;

  // Create Program Objects
  //
  
  // Load binary from disk
  unsigned char *kernelbinary;
  char *xclbin=argv[1];
  printf("loading %s\n", xclbin);
  int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
  if (n_i < 0) {
    printf("failed to load kernel from xclbin: %s\n", xclbin);
    printf("Test failed\n");
    return -1;
  }
  size_t n = n_i;
  // Create the compute program from offline
  program = clCreateProgramWithBinary(context, 1, &device_id, &n,
                                      (const unsigned char **) &kernelbinary, &status, &err);
  if ((!program) || (err!=CL_SUCCESS)) {
    printf("Error: Failed to create compute program from binary %d!\n", err);
    printf("Test failed\n");
    printf("err : %d %s\n",err,err);
  }

  // Build the program executable
  //
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    size_t len;
    char buffer[2048];

    printf("Error: Failed to build program executable!\n");
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    printf("%s\n", buffer);
    printf("Test failed\n");
    return -1;
  }

  // Create the compute kernel in the program we wish to run
  //
  kernel = clCreateKernel(program, "relu_1", &err);
  if (!kernel || err != CL_SUCCESS)
  {
    printf("Error: Failed to create compute kernel!\n");
    printf("Test failed\n");
    return -1;
  }

  // Create the input and output arrays in device memory for our calculation
  //
  in_array = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * NO_NODES, NULL, NULL);
  //synaptic_weights = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * NO_NODES * NO_NODES, NULL, NULL);
  out_array = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * NO_NODES, NULL, NULL);
  if (!in_array || /*!synaptic_weights ||*/ !out_array)
  {
    printf("Error: Failed to allocate device memory!\n");
    printf("Test failed\n");
    return -1;
  }    
    
	ifp = fopen("/home/agandhi92/sdaccel/relu_1/input.txt",mode);

	if(ifp == NULL)
	{
		printf("Input file not found \n");
  		return -1;
	}
	while (fscanf(ifp, "%f", &temp) != EOF && index < NO_NODES) {

		in_array_tb[index++] = temp;
	}
	index = 0;
	temp = 0;

	//ifp = fopen("/home/agandhi92/sdaccel/relu_1/weight.txt",mode);
	//if(ifp == NULL)
	//{
	//	printf("Weight file not found \n");
  	//	return -1;
	//}
	//while (fscanf(ifp, "%f", &temp) != EOF && index < (NO_NODES*NO_NODES)) {
	//	synaptic_weights_tb[index++] = temp;
	//}
   
  //
  // Write our data set into the input array in device memory 
  //
  err = clEnqueueWriteBuffer(commands, in_array, CL_TRUE, 0, sizeof(float) * NO_NODES, in_array_tb, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array a!\n");
    printf("Test failed\n");
    return -1;
  }

  // Write our data set into the input array in device memory 
  //
  //err = clEnqueueWriteBuffer(commands, synaptic_weights, CL_TRUE, 0, sizeof(float) *  NO_NODES *  NO_NODES, synaptic_weights_tb, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array b!\n");
    printf("Test failed\n");
    return -1;
  }
    
  // Set the arguments to our compute kernel
  //
  err = 0;
  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &in_array);
  //err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &synaptic_weights);
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_array);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  // Execute the kernel over the entire range of our 1d input data set
  // using the maximum number of work group items for this device
  //

  err = clEnqueueTask(commands, kernel, 0, NULL, NULL);

  if (err)
  {
    printf("Error: Failed to execute kernel! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  // Read back the results from the device to verify the output
  //
  cl_event readevent;
  err = clEnqueueReadBuffer( commands, out_array, CL_TRUE, 0, sizeof(float) * NO_NODES, out_array_tb, 0, NULL, &readevent );  
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output array! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  clWaitForEvents(1, &readevent);
    
  //printf("A\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",a[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //printf("B\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",b[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //printf("res\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",results[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
    
  // Validate our results
  //
  //correct = 0;
  //for(i = 0; i < DATA_SIZE; i++)
  //{
  //  int row = i/MATRIX_RANK;
  //  int col = i%MATRIX_RANK;
  //  int running = 0;
  //  int index;
  //  for (index=0;index<MATRIX_RANK;index++) {
  //    int aIndex = row*MATRIX_RANK + index;
  //    int bIndex = col + index*MATRIX_RANK;
  //    running += a[aIndex] * b[bIndex];
  //  }
  //  sw_results[i] = running;
  //}
  //  
  //for (i = 0;i < DATA_SIZE; i++) 
  //  if(results[i] == sw_results[i])
  //    correct++;
  //printf("Software\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  //printf("%0.2f ",sw_results[i]);
  //  printf("%d ",sw_results[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //  
  //  
  //// Print a brief summary detailing the results
  ////
  //printf("Computed '%d/%d' correct values!\n", correct, DATA_SIZE);
  //  
        
  // Shutdown and cleanup
	int temp_ = 0;


 for (j = 0; j < NO_NODES; j++)
 {
 	if (out_array_tb[j] >= 0) // || out_array_tb[j]== 0)
 	{
 		//printf("out_array[%d] = %f \n", j, out_array[j]);
 		temp_++;
 	}
 }


  clReleaseMemObject(in_array);
  //clReleaseMemObject(synaptic_weights);
  clReleaseMemObject(out_array);
  clReleaseProgram(program);
  clReleaseKernel(kernel);
  clReleaseCommandQueue(commands);
  clReleaseContext(context);

	if (temp_ == NO_NODES)
	{
		printf("*********************************************************** \n");
		printf("TEST PASSED !!!!!! The output matches the desired output. \n");
		printf("*********************************************************** \n");
		return EXIT_SUCCESS;
	}
	else
	{
		printf("**************************************************************** \n");
		printf("TEST Failed !!!!!! The output does not match the desired output. \n");
		printf("**************************************************************** \n");
		return -1;
	}

  //if(correct == DATA_SIZE){
  //  printf("Test passed!\n");
  //  return EXIT_SUCCESS;
  //}
  //else{
  //  printf("Test failed\n");
  //  return -1;
  //}
}
Example #2
0
int main() {
	char buf[]="Hello, World!";
	size_t srcsize, worksize=strlen(buf);
	
	cl_int error;
	cl_platform_id platform;
	cl_device_id device;
	cl_uint platforms, devices;

	// Fetch the Platform and Device IDs; we only want one.
	error=clGetPlatformIDs(1, &platform, &platforms);
	error=clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &devices);
	cl_context_properties properties[]={
		CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
		0};
	// Note that nVidia's OpenCL requires the platform property
	cl_context context=clCreateContext(properties, 1, &device, NULL, NULL, &error);
	cl_command_queue cq = clCreateCommandQueue(context, device, 0, &error);
	
	rot13(buf);	// scramble using the CPU
	puts(buf);	// Just to demonstrate the plaintext is destroyed

	//char src[8192];
	//FILE *fil=fopen("rot13.cl","r");
	//srcsize=fread(src, sizeof src, 1, fil);
	//fclose(fil);
	
	const char *src=rot13_cl;
	srcsize=strlen(rot13_cl);

	const char *srcptr[]={src};
	// Submit the source code of the rot13 kernel to OpenCL
	cl_program prog=clCreateProgramWithSource(context,
		1, srcptr, &srcsize, &error);
	// and compile it (after this we could extract the compiled version)
	error=clBuildProgram(prog, 0, NULL, "", NULL, NULL);

	// Allocate memory for the kernel to work with
	cl_mem mem1, mem2;
	mem1=clCreateBuffer(context, CL_MEM_READ_ONLY, worksize, NULL, &error);
	mem2=clCreateBuffer(context, CL_MEM_WRITE_ONLY, worksize, NULL, &error);
	
	// get a handle and map parameters for the kernel
	cl_kernel k_rot13=clCreateKernel(prog, "rot13", &error);
	clSetKernelArg(k_rot13, 0, sizeof(mem1), &mem1);
	clSetKernelArg(k_rot13, 1, sizeof(mem2), &mem2);

	// Target buffer just so we show we got the data from OpenCL
	char buf2[sizeof buf];
	buf2[0]='?';
	buf2[worksize]=0;

	// Send input data to OpenCL (async, don't alter the buffer!)
	error=clEnqueueWriteBuffer(cq, mem1, CL_FALSE, 0, worksize, buf, 0, NULL, NULL);
	// Perform the operation
	error=clEnqueueNDRangeKernel(cq, k_rot13, 1, NULL, &worksize, &worksize, 0, NULL, NULL);
	// Read the result back into buf2
	error=clEnqueueReadBuffer(cq, mem2, CL_FALSE, 0, worksize, buf2, 0, NULL, NULL);
	// Await completion of all the above
	error=clFinish(cq);
	
	// Finally, output out happy message.
	puts(buf2);
}
Example #3
0
void sum_gpu(long long *in, long long *out, unsigned int n)
{
	size_t global_size;
	size_t local_size;

	char *kernel_src;

	cl_int err;
	cl_platform_id platform_id;
	cl_device_id device_id;
	cl_uint max_compute_units;
	size_t max_workgroup_size;

	cl_context context;
	cl_command_queue commands;
	cl_program program;
	cl_kernel kernel;
	cl_mem d_array;

	cl_event event;
	cl_ulong start, end;

	/* start OpenCL */
	err = clGetPlatformIDs(1, &platform_id,NULL);
	clErrorHandling("clGetPlatformIDs");

	err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	clErrorHandling("clGetDeviceIDs");

	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	clErrorHandling("clCreateContext");

	commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	clErrorHandling("clCreateCommandQueue");

	/* create kernel */
	kernel_src = file_to_string(KERNEL_SRC);
	program = clCreateProgramWithSource(context, 1, (const char**) &kernel_src, NULL, &err);
	free(kernel_src);
	clErrorHandling("clCreateProgramWithSource");

	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	clErrorHandling("clBuildProgram");

	kernel = clCreateKernel(program, "matrix_mult", &err);
	clErrorHandling("clCreateKernel");

	/* allocate memory and send to gpu */
	d_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long long) * n, NULL, &err);
	clErrorHandling("clCreateBuffer");

	err = clEnqueueWriteBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long) * n, in, 0, NULL, NULL);
	clErrorHandling("clEnqueueWriteBuffer");

	err  = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL);
	err |= clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, NULL);
	clErrorHandling("clGetDeviceInfo");

	/* prepare kernel args */
	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_array);
	err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &n);

	/* execute */
	local_size = n / max_compute_units / 8;
	if (local_size > max_workgroup_size)
		local_size = max_workgroup_size;

	/*
	 *	Usually it would be
	 *	global_size = local_size * max_compute_units;
	 *	but that would only be valid if local_size = n / max_compute_units;
	 *	local_size is n / max_compute_units / 8 because it obtains its hightest performance.
	 */
	for (global_size = local_size; global_size < n; global_size += local_size);

	err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &event);
	clErrorHandling("clEnqueueNDRangeKernel");

	clWaitForEvents(1, &event);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
	fprintf(stderr, "Time for event (ms): %10.5f \n", (end - start) / 1000000.0);

	err = clFinish(commands);
	clErrorHandling("clFinish");

	/* transfer back */
	err = clEnqueueReadBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long), out, 0, NULL, NULL); // a single long long
	clErrorHandling("clEnqueueReadBuffer");

	/* cleanup*/
	clReleaseMemObject(d_array);
	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);
	clReleaseEvent(event);
}
Example #4
0
int main() {
    // Get platform information
    err = clGetPlatformIDs(0, NULL, &numOfPlatforms);
    if (err) Error("Fail to get the number of platforms");
    printf("The machine has %d platform(s) for OpenCL.\n", numOfPlatforms);

    platformIDs = new cl_platform_id [numOfPlatforms];
    err = clGetPlatformIDs(numOfPlatforms, platformIDs, NULL);
    if (err) Error("Fail to get the platform list");

    int cudaPlatformID = -1;

    for (int i = 0; i < numOfPlatforms; i++) {
        char platformName[50];
        err = clGetPlatformInfo(platformIDs[i], CL_PLATFORM_NAME, 50, platformName, NULL);
        if (err) Error("Fail to get the platform name");
        printf("Platform %d is %s\n", i + 1, platformName);
        if (!strcmp(platformName, "NVIDIA CUDA")) cudaPlatformID = i;
    }
    printf("\n");

    if (cudaPlatformID == -1) Error("Fail to find an NVIDIA CUDA platform");

    printf("Platform %d (NVIDIA CUDA) is chosen for use.\n", cudaPlatformID + 1);
    printf("\n");

    // Get device information
    err = clGetDeviceIDs(platformIDs[cudaPlatformID], CL_DEVICE_TYPE_GPU, 0, NULL, &numOfDevices);
    if (err) Error("Fail to get the number of devices");
    printf("CUDA platform has %d device(s).\n", numOfDevices);

    deviceIDs = new cl_device_id [numOfDevices];
    err = clGetDeviceIDs(platformIDs[cudaPlatformID], CL_DEVICE_TYPE_GPU, numOfDevices, deviceIDs, NULL);
    if (err) Error("Fail to get the device list");
    for (int i = 0; i < numOfDevices; i++) {
        char deviceName[50];
        err = clGetDeviceInfo(deviceIDs[i], CL_DEVICE_NAME, 50, deviceName, NULL);
        if (err) Error("Fail to get the device name");
        printf("Device %d is %s\n", i + 1, deviceName);
    }
    printf("\n");

    // Create a context
    context = clCreateContext(NULL, numOfDevices, deviceIDs, NULL, NULL, &err);
    if (err) Error("Fail to create a context");

    printf("Device 1 is chosen for use.\n");
    printf("\n");

    // Create a command queue for the first device
    commandQueue = clCreateCommandQueue(context, deviceIDs[0],
                                        CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &err);
    if (err) Error("Fail to create a command queue");

    // create the program
    cl_program program = CreateProgram(exclusiveScanKernels, "exclusive scan");

    // create two kernels
    cl_kernel scanKernel = clCreateKernel(program, "Scan", &err);
    if (err) Error("Fail to create the kernel for scan");

    cl_kernel reverseUpdateKernel = clCreateKernel(program, "ReverseUpdate", &err);
    if (err) Error("Fail to create the kernel for reverse update");

    // Get the work group size
    size_t maxWorkGroupSize;
    err = clGetKernelWorkGroupInfo(scanKernel, deviceIDs[0], CL_KERNEL_WORK_GROUP_SIZE,
                                   sizeof(size_t), &maxWorkGroupSize, NULL);
    printf("maxWorkGroupSize = %d\n", maxWorkGroupSize);

    err = clGetKernelWorkGroupInfo(reverseUpdateKernel, deviceIDs[0], CL_KERNEL_WORK_GROUP_SIZE,
                                   sizeof(size_t), &maxWorkGroupSize, NULL);
    printf("maxWorkGroupSize = %d\n", maxWorkGroupSize);

    // Set work group size to 64

    int workGroupSize = 512;

    int length = 2048000;
    int *arr = new int [length];
    for (int i = 0; i < length; i++)
        arr[i] = rand() % 100;

    int *prefixSum = new int [length];
    prefixSum[0] = 0;

    int t0 = clock();

    for (int i = 1; i < length; i++)
        prefixSum[i] = prefixSum[i - 1] + arr[i - 1];

    int t1 = clock();

    printf("time1: %lf\n", (t1 - t0) * 1.0 / CLOCKS_PER_SEC);

    cl_mem d_arr = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * length, NULL, &err);
    if (err) Error("Fail to create d_arr");

    err = clEnqueueWriteBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int) * length, arr, 0, NULL, NULL);
    if (err) Error("Fail to write d_arr");

    clSetKernelArg(scanKernel, 0, sizeof(cl_mem), &d_arr);
    cl_int d_length = length;
    clSetKernelArg(scanKernel, 1, sizeof(cl_int), &d_length);
    cl_int d_step = 1;
    clSetKernelArg(scanKernel, 2, sizeof(cl_int), &d_step);
    clSetKernelArg(scanKernel, 3, sizeof(cl_int) * (workGroupSize * 2 + workGroupSize * 2 / 16 + 1), NULL);

    int problemSize = length;
    int records[10];
    int num = 0;

    int t2 = clock();

    for (; problemSize > 1; problemSize = (problemSize - 1) / (workGroupSize * 2) + 1) {

        if (num) d_step *= workGroupSize * 2;

        printf("d_step = %d\n", d_step);

        records[num++] = problemSize;

        printf("problemSize = %d\n", problemSize);

        clSetKernelArg(scanKernel, 2, sizeof(cl_int), &d_step);

        size_t globalWorkSize = ((problemSize - 1) / (workGroupSize * 2) + 1) * workGroupSize;
        size_t localWorkSize = workGroupSize;

        err = clEnqueueNDRangeKernel(commandQueue, scanKernel, 1, NULL, &globalWorkSize, &localWorkSize,
                                     0, NULL, NULL);
        if (err) Error("Fail to enqueue scan");
        clFinish(commandQueue);
    }

    //CheckValues(length, d_arr);

    int zero = 0;
    clEnqueueWriteBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int), &zero, 0, NULL, NULL);

    printf("d_step = %d\n", d_step);

    //scanf("%*c");

    clSetKernelArg(reverseUpdateKernel, 0, sizeof(cl_mem), &d_arr);
    clSetKernelArg(reverseUpdateKernel, 1, sizeof(cl_int), &d_length);

    for (int i = num - 1; i >= 0; i--, d_step /= workGroupSize * 2) {
        printf("d_step = %d\n", d_step);

        clSetKernelArg(reverseUpdateKernel, 2, sizeof(cl_int), &d_step);
        size_t globalWorkSize = ((records[i] - 1) / (workGroupSize * 2) + 1) * workGroupSize;
        size_t localWorkSize = workGroupSize;

        printf("globalWorkSize = %d, localWorkSize = %d\n", globalWorkSize, localWorkSize);

        err = clEnqueueNDRangeKernel(commandQueue, reverseUpdateKernel, 1, NULL, &globalWorkSize, &localWorkSize,
                                     0, NULL, NULL);
        if (err) Error("Fail to enqueue scan");
        clFinish(commandQueue);
    }

    int t3 = clock();

    printf("time: %lf\n", (t3 - t2) * 1.0 / CLOCKS_PER_SEC);

    int *GPUResult = new int [length];
    memset(GPUResult, 0, sizeof(int) * length);
    err = clEnqueueReadBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int) * length, GPUResult, 0, NULL, NULL);
    printf("err = %d\n", err);
    if (err) Error("Fail to read d_arr");

    for (int i = 0; i < length; i++)
        if (GPUResult[i] != prefixSum[i]) printf("at i = %d, GPUResult[%d] = %d, prefixSum[%d] = %d\n", i, i, GPUResult[i], i, prefixSum[i]);

    system("pause");
    return 0;
}
Example #5
0
int device_check()
{

    
    cl_int err;
    cl_int i,j,cnt;
    
    cl_platform_id *platforms;
    cl_uint num_platforms;
    cl_platform_id platform;

    char* ext_data;
    size_t ext_size;
        
    cl_device_id *devs;
    size_t num_devs;
    cl_device_id device;
    
    /* Program data structures */
    cl_program program;
    FILE *program_handle;
    char *program_buffer[NUM_FILES];
    char *program_log;
    const char *file_name[] = {PROGRAM_FILE_1, PROGRAM_FILE_2};
    const char options[] = "-cl-finite-math-only -cl-no-signed-zeros";  
    size_t program_size[NUM_FILES];
    size_t log_size;
    
    /*kernel data*/
    cl_kernel *kernels;
    cl_uint num_kernels;
    

    /*枚举所有的平台,最多10个*/
    err = clGetPlatformIDs(10, NULL, &num_platforms);  /*参数1:要枚举的数量,参数2:返回结果的存放空间,参数3:返回结果的条数*/
    if(err < 0) {
        perror("Couldn't find any platforms");
        exit(1);
    }
    platforms=(cl_platform_id *)malloc( sizeof(cl_platform_id) * num_platforms );
    clGetPlatformIDs(num_platforms, platforms, NULL);

    /*现在num_platforms和platforms是平台的数量和数据指针*/
    /* Find infor of all platforms */
    for (i=0; i<num_platforms; i++)
    {
        /* Find size of extension data */
        /*clGetPlatformInfo*/
        /*
        参数1:平台
        参数2:所需信息的枚举
        参数3:返回值需要保存的长度
        参数4:返回值的存储空间
        参数5:所需数据的真实长度
        */

        
        platform = platforms[i]

        /*NAME*/
        err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &ext_size);  
        if(err < 0) {
            perror("Couldn't read CL_PLATFORM_NAME data.");
        }
        ext_data = (char *)malloc(ext_size);
        clGetPlatformInfo(platform, CL_PLATFORM_NAME, ext_size, ext_data, NULL);
        printf("Platform %d name: %s\n", i, ext_data);
        free(ext_data);

        /*VRNDOR*/
        err = clGetPlatformInfo(platform, CL_PLATFORM_VENDER, 0, NULL, &ext_size);
        if(err < 0) {
            perror("Couldn't read CL_PLATFORM_VENDER data.");
        }
        ext_data = (char *)malloc(ext_size);
        clGetPlatformInfo(platform, CL_PLATFORM_VENDER, ext_size, ext_data, NULL);
        printf("Platform %d vender: %s\n", i, ext_data);
        free(ext_data)

        /*VERSION*/
        err = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &ext_size);
        if(err < 0) {
            perror("Couldn't read CL_PLATFORM_VERSION data.");
        }
        ext_data = (char *)malloc(ext_size);
        clGetPlatformInfo(platform, CL_PLATFORM_VERSION, ext_size, ext_data, NULL);
        printf("Platform %d support OpenCL version: %s\n", i, ext_data);
        free(ext_data)
        
        /*PROFILE*/
        err = clGetPlatformInfo(platform, CL_PLATFORM_PROFILE, 0, NULL, &ext_size);
        if(err < 0) {
            perror("Couldn't read CL_PLATFORM_PROFILE data.");
        }
        ext_data = (char *)malloc(ext_size);
        clGetPlatformInfo(platform, CL_PLATFORM_PROFILE, ext_size, ext_data, NULL);
        printf("Platform %d support OpenCL profile: %s\n", i, ext_data);
        free(ext_data)
        
        /*EXTENSIONS*/
        err = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, 0, NULL, &ext_size);
        if(err < 0) {
            perror("Couldn't read CL_PLATFORM_EXTENSIONS data.");
        }
        ext_data = (char *)malloc(ext_size);
        clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, ext_size, ext_data, NULL);
        printf("Platform %d support OpenCL extensions: %s\n", i, ext_data);
        free(ext_data)
        
        /*现在对这个platform进一步的提取信息*/
        /*获取Device信息*/
        /*clGetDeviceIDs*/
        /*
        参数1:平台句柄
        参数2:要获取设备的类型
        参数3:要获取的数量
        参数4:返回信息的数据指针
        参数5:返回信息的实际条数
        */
        for (cnt=0; cnt<2; cnt++)
        {
            if (0==cnt) err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devs);
            if (1==cnt) err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &num_devs);
            if(err == CL_DEVICE_NOT_FOUND)
            {
                if (0==cnt) printf("No GPU support OpenCL found.\n");
                if (1==cnt) printf("No CPU support OpenCL found.\n");
            }
            else if(err < 0)
            {
                if (0==cnt) printf("Couldn't access any GPU devices.\n");
                if (1==cnt) printf("Couldn't access any CPU devices.\n");
            }
            else
            {
                devs = (cl_device_id *)malloc( sizeof(cl_device_id) * num_devs );
                clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devs, devs, NULL);
                for (j=0; j<num_devs; j++)
                {
                    cl_device_id dev;
                    char dev_name_data[48];
                    cl_uint addr_data;
                    cl_ulong global_mem_size;
                    
                    /*name*/
                    err = clGetDeviceInfo(dev, CL_DEVICE_NAME, 48 * sizeof(char), dev_name_data, NULL);
                    if(err < 0) {
                        perror("Couldn't read dev name data");
                        exit(1);
                    }
                    printf("Dev %d: NAME: %s\n", j, name_data);
                                    
                    /*address size*/
                    err = clGetDeviceInfo(dev, CL_DEVICE_ADDRESS_BITS, sizeof(addr_data), &addr_data, NULL);
                    printf("Dev %d: ADDRESS_WIDTH: %u\n", j, addr_data);
                    
                    /*device extensions*/
                    ext_data=malloc(4096)
                    clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 4096 * sizeof(char), ext_data, NULL);
                    printf("Dev %d: EXTENSIONS: %s\n", j, ext_data);
                    free(ext_data)
                    
                    err = clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(global_mem_size), &global_mem_size, NULL);
                    printf("Dev %d: GLOBAL_MEM_SIZE: %u\n", j, global_mem_size);
                }
            }

        }
    }

    return 0;
}
Example #6
0
void InitOpenCL()
{
    // 1. Get a platform.
    cl_platform_id platform;
    
    clGetPlatformIDs( 1, &platform, NULL );
    // 2. Find a gpu device.
    cl_device_id device;
    
    clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU,
                   1,
                   &device,
                   NULL);
    // 3. Create a context and command queue on that device.
    cl_context context = clCreateContext( NULL,
                                         1,
                                         &device,
                                         NULL, NULL, NULL);
    queue = clCreateCommandQueue( context,
                                 device,
                                 0, NULL );
    // 4. Perform runtime source compilation, and obtain kernel entry point.
    std::ifstream file("scene.cl");
    std::string source;
    if (file){
    while(!file.eof()){
        char line[256];
        file.getline(line,255);
        source += std::string(line) + "\n";
    }
    }
    if (source.length()==0)
    {
        std::string err = "fail to load shader";
    }
    
    cl_ulong maxSize;
    clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), &maxSize, 0);
    
    const char* str = source.c_str();
    cl_program program = clCreateProgramWithSource( context,
                                                   1,
                                                   &str,
                                                   NULL, NULL );
    cl_int result = clBuildProgram( program, 1, &device, NULL, NULL, NULL );
    if ( result ){
        char* build_log;
        size_t log_size;
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        build_log = new char[log_size+1];
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
        build_log[log_size] = '\0';
        if( log_size > 2 ) {
            std::cout << "build log: " << build_log << std::endl;
        }
        delete[] build_log;
        std::cout << "Error during compilation! (" << result << ")" << std::endl;
    }
    kernel = clCreateKernel( program, "tracekernel", NULL );
    // 5. Create a data buffer.
    buffer        = clCreateBuffer( context,
                                   CL_MEM_WRITE_ONLY,
                                   kWidth * kHeight *sizeof(cl_float4),
                                   NULL, 0 );
    viewTransform = clCreateBuffer( context,
                                   CL_MEM_READ_WRITE,
                                   16 *sizeof(cl_float),
                                   NULL, 0 );
    
    worldTransforms = clCreateBuffer( context,
                                     CL_MEM_READ_WRITE,
                                     16 *sizeof(cl_float)*2,
                                     NULL, 0 );
    
    clSetKernelArg(kernel, 0, sizeof(buffer), (void*) &buffer);
    clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*) &kWidth);
    clSetKernelArg(kernel, 2, sizeof(cl_uint), (void*) &kWidth);
    clSetKernelArg(kernel, 3, sizeof(viewTransform), (void*) &viewTransform);
    clSetKernelArg(kernel, 4, sizeof(worldTransforms), (void*) &worldTransforms);
}
Example #7
0
int main()
{
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem memobj = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;

    float mem[MEM_SIZE];

    FILE *fp;
    char fileName[] = "./kernel.clbin";
    size_t binary_size;
    char *binary_buf;
    cl_int binary_status;
    cl_int i;

    /* カーネルを含むオブジェクトファイルをロード */
    fp = fopen(fileName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    binary_buf = (char *)malloc(MAX_BINARY_SIZE);
    binary_size = fread( binary_buf, 1, MAX_BINARY_SIZE, fp );
    fclose( fp );

    /* データを初期化 */
    for( i = 0; i < MEM_SIZE; i++ ) {
        mem[i] = i;
    }

    /* プラットフォーム・デバイスの情報の取得 */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

    /* OpenCLコンテキストの作成 */
    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
  
    /* コマンドキューの作成 */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
  
    /* メモリバッファの作成 */
    memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(float), NULL, &ret);

    /* メモリバッファにデータを転送 */
    ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);

    /* 読み込んだバイナリからカーネルプログラムを作成 */
    program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *)&binary_size, 
                                        (const unsigned char **)&binary_buf, &binary_status, &ret);
    
    /* OpenCLカーネルの作成 */
    kernel = clCreateKernel(program, "vecAdd", &ret);
    printf("err:%d\n", ret);

    /* OpenCLカーネル引数の設定 */
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);

    size_t global_work_size[3] = {MEM_SIZE, 0, 0};
    size_t local_work_size[3]  = {MEM_SIZE, 0, 0};

    /* OpenCLカーネルを実行 */
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);

    /* メモリバッファから結果を取得 */
    ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);

    /* 結果の表示 */
    for(i=0; i<MEM_SIZE; i++) {
        printf("mem[%d] : %f\n", i, mem[i]);
    }
  
    /* 終了処理 */
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(memobj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    free(binary_buf);

    return 0;
}
Example #8
0
int main(void) {
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Generate the input array on the host.
    float h_a[ARRAY_SIZE];
    float h_b[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++) {
        h_a[i] = (float)i;
        h_b[i] = (float)(2 * i);
    }

    float h_c[ARRAY_SIZE];

    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("vectors_cl.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char *)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
                         &device_id, &ret_num_devices);

    // Create an OpenCL context
    cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    // Create memory buffers on the device for each vector
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                      ARRAY_BYTES, NULL, &ret);
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                      ARRAY_BYTES, NULL, &ret);
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                      ARRAY_BYTES, NULL, &ret);

    // Copy h_a and h_b to memory buffer
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
                               ARRAY_BYTES, h_a, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
                               ARRAY_BYTES, h_b, 0, NULL, NULL);

    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,
        (const char **)&source_str, (const size_t *)&source_size, &ret);
    if (ret != 0) {
        printf("clCreateProgramWithSource returned non-zero status %d\n\n", ret);
        exit(1);
    }

    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret != 0) {
        printf("clBuildProgram returned non-zero status %d: ", ret);

        if (ret == CL_INVALID_PROGRAM) {
            printf("invalid program\n");
        } else if (ret == CL_INVALID_VALUE) {
            printf("invalid value\n");
        } else if (ret == CL_INVALID_DEVICE) {
            printf("invalid device\n");
        } else if (ret == CL_INVALID_BINARY) {
            printf("invalid binary\n");
        } else if (ret == CL_INVALID_BUILD_OPTIONS) {
            printf("invalid build options\n");
        } else if (ret == CL_INVALID_OPERATION) {
            printf("invalid operation\n");
        } else if (ret == CL_COMPILER_NOT_AVAILABLE) {
            printf("compiler not available\n");
        } else if (ret == CL_BUILD_PROGRAM_FAILURE) {
            printf("build program failure\n");

            // Determine the size of the log
            size_t log_size;
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

            // Allocate memory for the log
            char *log = (char *) malloc(log_size);

            // Get the log
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

            // Print the log
            printf("%s\n", log);
        } else if (ret == CL_OUT_OF_HOST_MEMORY) {
            printf("out of host memory\n");
        }
        exit(1);
    }

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "add", &ret);

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
    size_t array_size = ARRAY_SIZE;
    ret = clSetKernelArg(kernel, 3, sizeof(const size_t), (void *)&array_size);

    // Execute the OpenCL kernel on the list
    size_t global_item_size = ARRAY_SIZE; // Process the entire lists
    size_t local_item_size = 1; // Divide work items into groups of 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
            &global_item_size, &local_item_size, 0, NULL, NULL);

    // Read the memory buffer C on the device to the local variable C
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
                              ARRAY_BYTES, h_c, 0, NULL, NULL);

    // Print out the resulting array.
    for (int i = 0; i < 8; i++) {
        printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    printf("...\n");

    for (int i = ARRAY_SIZE - 8; i < ARRAY_SIZE; i++) {
        printf("%d + %d = %d",
               (int)h_a[i], (int)h_b[i], (int)h_c[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(a_mem_obj);
    ret = clReleaseMemObject(b_mem_obj);
    ret = clReleaseMemObject(c_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    return 0;
}
Example #9
0
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX, bufY;
    cl_event event = NULL;
    int ret = 0;
	int lenX = 1 + (N-1)*abs(incx);
	int lenY = 1 + (N-1)*abs(incy);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err);
    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);

	printResult();

    /* Call clblas function. */
    err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event);
//	printf("here\n");
    if (err != CL_SUCCESS) {
        printf("clblasSrot() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)),
                                    Y, 0, NULL, NULL);
        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)),
                                    X, 0, NULL, NULL);

        /* At this point you will get the result of SROT placed in vector Y. */
        printResult();
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufY);
    clReleaseMemObject(bufX);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Example #10
0
int main(void)
{
  hwloc_topology_t topology;
  cl_int clret;
  cl_platform_id *platform_ids;
  unsigned nrp, nrd, count, i, j;
  int err;

  hwloc_topology_init(&topology);
  hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
  hwloc_topology_load(topology);

  clret = clGetPlatformIDs(0, NULL, &nrp);
  if (CL_SUCCESS != clret || !nrp)
    return 0;
  platform_ids = malloc(nrp * sizeof(*platform_ids));
  if (!platform_ids)
    return 0;
  clret = clGetPlatformIDs(nrp, platform_ids, &nrp);
  if (CL_SUCCESS != clret || !nrp)
    return 0;

  count = 0;
  for(i=0; i<nrp; i++) {
    cl_device_id *device_ids;

    clret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, 0, NULL, &nrd);
    if (CL_SUCCESS != clret || !nrd)
      continue;
    device_ids = malloc(nrd * sizeof(*device_ids));
    if (!device_ids)
      continue;
    clret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, nrd, device_ids, &nrd);
    if (CL_SUCCESS != clret || !nrd)
      continue;

    for(j=0; j<nrd; j++) {
      hwloc_bitmap_t set;
      hwloc_obj_t osdev, osdev2, ancestor;
      const char *value;

      osdev = hwloc_opencl_get_device_osdev(topology, device_ids[j]);
      osdev2 = hwloc_opencl_get_device_osdev_by_index(topology, i, j);
      assert(osdev == osdev2);
      if (!osdev) {
	printf("no osdev for platform %d device %d\n", i, j);
	continue;
      }

      ancestor = hwloc_get_non_io_ancestor_obj(topology, osdev);

      set = hwloc_bitmap_alloc();
      err = hwloc_opencl_get_device_cpuset(topology, device_ids[j], set);
      if (err < 0) {
	printf("no cpuset for platform %d device %d\n", i, j);
      } else {
	char *cpuset_string = NULL;
	hwloc_bitmap_asprintf(&cpuset_string, set);
	printf("got cpuset %s for platform %d device %d\n", cpuset_string, i, j);
	free(cpuset_string);
	assert(hwloc_bitmap_isequal(set, ancestor->cpuset));
      }
      hwloc_bitmap_free(set);

      printf("found OSDev %s\n", osdev->name);
      err = strncmp(osdev->name, "opencl", 6);
      assert(!err);
      assert(atoi(osdev->name+6) == (int) count);

      value = hwloc_obj_get_info_by_name(osdev, "Backend");
      err = strcmp(value, "OpenCL");
      assert(!err);

      value = hwloc_obj_get_info_by_name(osdev, "Name");
      printf("found OSDev name %s\n", value);

      count++;
    }
  }

  hwloc_topology_destroy(topology);

  return 0;
}
Example #11
0
int main() {

   // Set the image rotation (in degrees)
   float theta = 3.14159/6;
   float cos_theta = cosf(theta);
   float sin_theta = sinf(theta);
   printf("theta = %f (cos theta = %f, sin theta = %f)\n", theta, cos_theta, 
      sin_theta);

   // Rows and columns in the input image
   int imageHeight;
   int imageWidth;

   const char* inputFile = "input.bmp";
   const char* outputFile = "output.bmp";

   // Homegrown function to read a BMP from file
   float* inputImage = readImage(inputFile, &imageWidth,
      &imageHeight);

   // Size of the input and output images on the host
   int dataSize = imageHeight*imageWidth*sizeof(float);

   // Output image on the host
   float* outputImage = NULL;
   outputImage = (float*)malloc(dataSize);

   // Set up the OpenCL environment
   cl_int status;

   // Discovery platform
   cl_platform_id platforms[2];
   cl_platform_id platform;
   status = clGetPlatformIDs(2, platforms, NULL);
   chk(status, "clGetPlatformIDs");
   platform = platforms[PLATFORM_TO_USE];

   // Discover device
   cl_device_id device;
   clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
   chk(status, "clGetDeviceIDs");

   // Create context
   cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
       (cl_context_properties)(platform), 0};
   cl_context context;
   context = clCreateContext(props, 1, &device, NULL, NULL, &status);
   chk(status, "clCreateContext");

   // Create command queue
   cl_command_queue queue;
   queue = clCreateCommandQueue(context, device, 0, &status);
   chk(status, "clCreateCommandQueue");

   // Create the input and output buffers
   cl_mem d_input;
   d_input = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL,
       &status);
   chk(status, "clCreateBuffer");

   cl_mem d_output;
   d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL,
       &status);
   chk(status, "clCreateBuffer");

   // Copy the input image to the device
   status = clEnqueueWriteBuffer(queue, d_input, CL_TRUE, 0, dataSize, 
         inputImage, 0, NULL, NULL);
   chk(status, "clEnqueueWriteBuffer");

   const char* source = readSource("rotation.cl");

   // Create a program object with source and build it
   cl_program program;
   program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
   chk(status, "clCreateProgramWithSource");
   status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
   chk(status, "clBuildProgram");
   
   // Create the kernel object
   cl_kernel kernel;
   kernel = clCreateKernel(program, "img_rotate", &status);
   chk(status, "clCreateKernel");

   // Set the kernel arguments
   status  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_output);
   status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_input);
   status |= clSetKernelArg(kernel, 2, sizeof(int), &imageWidth);
   status |= clSetKernelArg(kernel, 3, sizeof(int), &imageHeight);
   status |= clSetKernelArg(kernel, 4, sizeof(float), &sin_theta);
   status |= clSetKernelArg(kernel, 5, sizeof(float), &cos_theta);
   chk(status, "clSetKernelArg");

   // Set the work item dimensions
   size_t globalSize[2] = {imageWidth, imageHeight};
   status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
      NULL, NULL);
   chk(status, "clEnqueueNDRange");

   // Read the image back to the host
   status = clEnqueueReadBuffer(queue, d_output, CL_TRUE, 0, dataSize, 
         outputImage, 0, NULL, NULL); 
   chk(status, "clEnqueueReadBuffer");

   // Write the output image to file
   storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);

   return 0;
}
Example #12
0
int main(int argc, char** argv)
{
    int err;                            // error code returned from api calls
    
    float data[DATA_SIZE];              // original data set given to device
    float results[DATA_SIZE];           // results returned from device
    unsigned int correct;               // number of correct results returned
    
    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation
    
    cl_device_id device_id;             // compute device id
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel
    
    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array
    
    // Fill our data set with random float values
    //
    int i = 0;
    unsigned int count = DATA_SIZE;
    for(i = 0; i < count; i++)
        data[i] = rand() / (float)RAND_MAX;
    
    // Connect to a compute device
    //
    int gpu = 1;
    err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }
    
    // Create a compute context
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }
    
    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }
    
    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }
    
    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
        
        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }
    
    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "square", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }
    
    // Create the input and output arrays in device memory for our calculation
    //
    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }
    
    // Write our data set into the input array in device memory
    //
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }
    
    // Set the arguments to our compute kernel
    //
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }
    
    // Get the maximum work group size for executing the kernel on the device
    //
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }
    
    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    //
    global = count;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }
    
    // Wait for the command commands to get serviced before reading back results
    //
    clFinish(commands);
    
    // Read back the results from the device to verify the output
    //
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }
    
    // Validate our results
    //
    correct = 0;
    for(i = 0; i < count; i++)
    {
        if(results[i] == data[i] * data[i])
            correct++;
    }
    
    // Print a brief summary detailing the results
    //
    printf("Computed '%d/%d' correct values!\n", correct, count);
    
    // Shutdown and cleanup
    //
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    
    return 0;
}
cl_context CreateContext()
{
	cl_int errNum;
	cl_uint numPlatforms;
	cl_platform_id * platformIDs;
	cl_uint numDevices;
	cl_device_id * deviceIDs;
	cl_context context = NULL;

	// First, select an OpenCL platform to run on. 
	errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	platformIDs = (cl_platform_id *)alloca(
		sizeof(cl_platform_id) * numPlatforms);
	errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
	if (errNum != CL_SUCCESS || numPlatforms <= 0)
	{
		std::cerr << "Failed to find any OpenCL platforms." << std::endl;
		return NULL;
	}
	else std::cout<<"number of platforms:"<<numPlatforms<<std::endl;

	deviceIDs = NULL;
	errNum = clGetDeviceIDs(
		platformIDs[0], 
		CL_DEVICE_TYPE_GPU, //寻找GPU device 的个数
		0,
		NULL,
		&numDevices);
	if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
	{
		checkErr(errNum, "clGetDeviceIDs");
	}
	else std::cout<<"number of devices:"<<numDevices<<std::endl;
	deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
	errNum = clGetDeviceIDs(
		platformIDs[0],
		CL_DEVICE_TYPE_GPU,
		numDevices, 
		&deviceIDs[0], 
		NULL);
	checkErr(errNum, "clGetDeviceIDs");
	// Next, create an OpenCL context on the platform.  Attempt to
	// create a GPU-based context, and if that fails, try to create
	// a CPU-based context.
	cl_context_properties contextProperties[] =
	{
		CL_CONTEXT_PLATFORM,
		(cl_context_properties)platformIDs[0],
		0
	};
	/*context = clCreateContext(
		contextProperties, 
		numDevices,
		deviceIDs, 
		NULL,
		NULL, 
		&errNum);
	checkErr(errNum, "clCreateContext");*/
	//如果有平台和设备类型,可以使用clCreateContextFromType()创建上下文。上下文较随意,可以包含不同设备类型
	context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
		NULL, NULL, &errNum);
	if (errNum != CL_SUCCESS)
	{
		std::cout << "Could not create GPU context, trying CPU..." << std::endl;
		context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,
			NULL, NULL, &errNum);
		if (errNum != CL_SUCCESS)
		{
			std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
			return NULL;
		}
	}

	return context;
}
Example #14
0
	OpenCLDevice(DeviceInfo& info, Stats &stats, bool background_)
	  : Device(stats)
	{
		background = background_;
		cpPlatform = NULL;
		cxContext = NULL;
		cqCommandQueue = NULL;
		cpProgram = NULL;
		ckPathTraceKernel = NULL;
		ckFilmConvertKernel = NULL;
		null_mem = 0;
		device_initialized = false;

		/* setup platform */
		cl_uint num_platforms;

		ciErr = clGetPlatformIDs(0, NULL, &num_platforms);
		if(opencl_error(ciErr))
			return;

		if(num_platforms == 0) {
			opencl_error("OpenCL: no platforms found.");
			return;
		}

		ciErr = clGetPlatformIDs(1, &cpPlatform, NULL);
		if(opencl_error(ciErr))
			return;

		char name[256];
		clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
		platform_name = name;

		/* get devices */
		vector<cl_device_id> device_ids;
		cl_uint num_devices;

		if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), 0, NULL, &num_devices)))
			return;

		if(info.num > num_devices) {
			if(num_devices == 0)
				opencl_error("OpenCL: no devices found.");
			else
				opencl_error("OpenCL: specified device not found.");
			return;
		}

		device_ids.resize(num_devices);
		
		if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), num_devices, &device_ids[0], NULL)))
			return;

		cdDevice = device_ids[info.num];

		/* create context */
		cxContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr);
		if(opencl_error(ciErr))
			return;

		cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
		if(opencl_error(ciErr))
			return;

		null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
		device_initialized = true;
	}
Example #15
0
int main(int argc, const char** argv) {
  // start logs
  printf("clDeviceQuery Starting...\n\n");
  bool bPassed = true;
  std::string sProfileString = "clDeviceQuery, Platform Name = ";

  // Get OpenCL platform ID for NVIDIA if avaiable, otherwise default
  char cBuffer[1024];
  cl_platform_id clSelectedPlatformID = NULL;
  cl_platform_id* clPlatformIDs;

  cl_uint num_platforms;
  cl_int ciErrNum = clGetPlatformIDs(0, NULL, &num_platforms);
  if (ciErrNum != CL_SUCCESS) {
    printf(" Error %i in clGetPlatformIDs Call!\n\n", ciErrNum);
    bPassed = false;
  } else {
    if (num_platforms == 0) {
      printf("No OpenCL platform found!\n\n");
      bPassed = false;
    } else {
      // if there's one platform or more, make space for ID's
      if ((clPlatformIDs = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id))) == NULL) {
	printf("Failed to allocate memory for cl_platform ID's!\n\n");
	bPassed = false;
      }

      printf("%d OpenCL Platforms found\n\n", num_platforms);
      // get platform info for each platform
      ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs, NULL);
      for(cl_uint i = 0; i < num_platforms; ++i) {
	ciErrNum = clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, 1024, &cBuffer, NULL);
	if(ciErrNum == CL_SUCCESS) {
	  clSelectedPlatformID = clPlatformIDs[i];
	  // Get OpenCL platform name and version
	  ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
	  if (ciErrNum == CL_SUCCESS) {
	    printf(" CL_PLATFORM_NAME: \t%s\n", cBuffer);
	    sProfileString += cBuffer;
	  } else {
	    printf(" Error %i in clGetPlatformInfo Call !!!\n\n", ciErrNum);
    bPassed = false;
  }
  sProfileString += ", Platform Version = ";

  ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_VERSION, sizeof(cBuffer), cBuffer, NULL);
  if (ciErrNum == CL_SUCCESS) {
    printf(" CL_PLATFORM_VERSION: \t%s\n", cBuffer);
    sProfileString += cBuffer;
  } else {
    printf(" Error %i in clGetPlatformInfo Call !!!\n\n", ciErrNum);
    bPassed = false;
  }

  // Log OpenCL SDK Version # (for convenience:  not specific to OpenCL)
  sProfileString += ", NumDevs = ";

  // Get and log OpenCL device info
  cl_uint ciDeviceCount;
  cl_device_id *devices;
  printf("OpenCL Device Info:\n\n");
  ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, 0, NULL, &ciDeviceCount);

  // check for 0 devices found or errors...
  if (ciDeviceCount == 0) {
    printf(" No devices found supporting OpenCL (return code %i)\n\n", ciErrNum);
    bPassed = false;
    sProfileString += "0";
  } else if (ciErrNum != CL_SUCCESS) {
    printf(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
    bPassed = false;
  } else {
    // Get and log the OpenCL device ID's
    ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
    printf(" %u devices found supporting OpenCL on: %s\n\n", ciDeviceCount, cBuffer);
    char cTemp[2];
    sprintf(cTemp, "%u", ciDeviceCount);
    sProfileString += cTemp;
    if ((devices = (cl_device_id*)malloc(sizeof(cl_device_id) * ciDeviceCount)) == NULL) {
      printf(" Failed to allocate memory for devices !!!\n\n");
      bPassed = false;
    }
    ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, ciDeviceCount, devices, &ciDeviceCount);
    if (ciErrNum == CL_SUCCESS) {
      for(unsigned int i = 0; i < ciDeviceCount; ++i )  {
        printf(" ----------------------------------\n");
clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
printf(" Device %s\n", cBuffer);
printf(" ---------------------------------\n");
clPrintDevInfo(devices[i]);
sProfileString += ", Device = ";
sProfileString += cBuffer;
      }
            } else {
      printf(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
      bPassed = false;
    }
  }

  // masterlog info
  sProfileString += "\n";
  printf("%s", sProfileString.c_str());
}
free(clPlatformIDs);
      }
    }
  }

  // Log system info(for convenience:  not specific to OpenCL)
  printf( "\nSystem Info: \n\n");
  char timestr[255];
  time_t now = time(NULL);
  struct tm  *ts;

  ts = localtime(&now);

  strftime(timestr, 255, " %H:%M:%S, %m/%d/%Y",ts);

  // write time and date to logs
  printf(" Local Time/Date = %s\n", timestr);
  // write proc and OS info to logs
  // parse /proc/cpuinfo
  std::ifstream cpuinfo( "/proc/cpuinfo" ); // open the file in /proc
  std::string tmp;

  int cpu_num = 0;
  std::string cpu_name = "none";
  do {
    cpuinfo >> tmp;

    if( tmp == "processor" )
      cpu_num++;

    if( tmp == "name" ) {
      cpuinfo >> tmp; // skip :

      std::stringstream tmp_stream("");
      do {
	cpuinfo >> tmp;
	if (tmp != std::string("stepping")) {
	  tmp_stream << tmp.c_str() << " ";
	}

      }
      while (tmp != std::string("stepping"));

      cpu_name = tmp_stream.str();
    }
  }
  while ( (! cpuinfo.eof()) );

  // Linux version
  std::ifstream version( "/proc/version" );
  char versionstr[255];

  version.getline(versionstr, 255);

  printf(" CPU Name: %s\n # of CPU processors: %u\n %s\n\n\n",
	 cpu_name.c_str(),cpu_num,versionstr);

  // finish
  printf("TEST %s\n\n", bPassed ? "PASSED" : "FAILED !!!");
}
Example #16
0
int main( int argc, char* argv[] )
{
    // Length of vectors
    unsigned int n = 100000;
 
    // Host input vectors
    double *h_a;
    double *h_b;
    // Host output vector
    double *h_c;
 
    // Device input buffers
    cl_mem d_a;
    cl_mem d_b;
    // Device output buffer
    cl_mem d_c;
 
    cl_platform_id cpPlatform;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    cl_command_queue queue;           // command queue
    cl_program program;               // program
    cl_kernel kernel;                 // kernel
 
    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);
 
    // Allocate memory for each vector on host
    h_a = (double*)malloc(bytes);
    h_b = (double*)malloc(bytes);
    h_c = (double*)malloc(bytes);
 
    // Initialize vectors on host
    int i;
    for( i = 0; i < n; i++ )
    {
        h_a[i] = sinf(i)*sinf(i);
        h_b[i] = cosf(i)*cosf(i);
    }
 
    size_t globalSize, localSize;
    cl_int err;
 
    // Number of work items in each local work group
    localSize = 64;
 
    // Number of total work items - localSize must be devisor
    globalSize = ceil(n/(float)localSize)*localSize;
 
    // Bind to platform
    err = clGetPlatformIDs(1, &cpPlatform, NULL);
 
    // Get ID for the device
    err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
 
    // Create a context 
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
 
    // Create a command queue
    queue = clCreateCommandQueue(context, device_id, 0, &err);
 
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1,
                            (const char **) & kernelSource, NULL, &err);
 
    // Build the program executable
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
 
    // Create the compute kernel in the program we wish to run
    kernel = clCreateKernel(program, "vecAdd", &err);
 
    // Create the input and output arrays in device memory for our calculation
    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
 
    // Write our data set into the input array in device memory
    err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
                                   bytes, h_a, 0, NULL, NULL);
    err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
                                   bytes, h_b, 0, NULL, NULL);
 
    // Set the arguments to our compute kernel
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
 
    // Execute the kernel over the entire range of the data set 
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                                                              0, NULL, NULL);
 
    // Wait for the command queue to get serviced before reading back results
    clFinish(queue);
 
    // Read the results from the device
    clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
                                bytes, h_c, 0, NULL, NULL );
 
    //Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);
 
    // release OpenCL resources
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
 
    //release host memory
    free(h_a);
    free(h_b);
    free(h_c);
 
    return 0;
}
Example #17
0
/** 
 * @brief Create a new OpenCL zone, which will contain complete information for an OpenCL execution session on a specific device. 
 * 
 * @param deviceType Device type (OpenCL bitfield).
 * @param numQueues Number of command queues.
 * @param queueProperties Properties for the command queues.
 * @param devSel Pointer to function which will select device, if more than one is available.
 * @param dsExtraArg Extra argument for (*deviceSelector) function.
 * @param err Error structure, to be populated if an error occurs.
 * @return OpenCL zone or NULL if device wasn't properly initialized.
 */
CLUZone* clu_zone_new(cl_uint deviceType, cl_uint numQueues, cl_int queueProperties, clu_device_selector devSel, void* dsExtraArg, GError **err) {
	
	/* OpenCL status variable. */
	cl_int status;
	
	/* OpenCL zone to initialize and return */
	CLUZone* zone;
	
	/* Information about devices */
	CLUDeviceInfo devInfos[CLU_MAX_DEVICES_TOTAL];

	/* Number of devices. */
	cl_uint numDevices;

	/* Index of device information */
	cl_int deviceInfoIndex;

	/* Context properties, */
	cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, 0, 0};

	/* List of platform Ids. */
	cl_platform_id platfIds[CLU_MAX_PLATFORMS];

	/* Number of platforms. */
	cl_uint numPlatforms;

	/* Total number of devices. */
	unsigned int totalNumDevices;

	/* Device IDs for a given platform. */
	cl_device_id devIds[CLU_MAX_DEVICES_PER_PLATFORM];
	
	/* Initialize zone */
	zone = (CLUZone*) malloc(sizeof(CLUZone));
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		NULL == zone, 
		CLU_ERROR_NOALLOC, 
		error_handler, 
		"Unable to allocate memory for OpenCL zone"
	);
	zone->context = NULL;
	zone->queues = NULL;
	zone->program = NULL;
	zone->device_info.device_id = NULL;
	zone->device_info.platform_id = NULL;
	zone->device_info.device_name[0] = '\0';
	zone->device_info.device_vendor[0] = '\0';
	zone->device_info.platform_name[0] = '\0';
		
	/* Get number of platforms */
	status = clGetPlatformIDs(0, NULL, &numPlatforms);
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		CL_SUCCESS != status, 
		CLU_OCL_ERROR, 
		error_handler, 
		"clu_zone_new: get number of platforms (OpenCL error %d: %s).",
		status,
		clerror_get(status));

	/* Get existing platforms */
	status = clGetPlatformIDs(numPlatforms, platfIds, NULL);
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		CL_SUCCESS != status, 
		CLU_OCL_ERROR, 
		error_handler, 
		"clu_zone_new: get platform Ids (OpenCL error %d: %s).", 
		status,
		clerror_get(status));

	/* Cycle through platforms, get specified devices in existing platforms */
	totalNumDevices = 0;
	for(unsigned int i = 0; i < numPlatforms; i++) 	{
		/* Get specified devices for current platform */
		status = clGetDeviceIDs(
			platfIds[i], 
			deviceType, 
			CLU_MAX_DEVICES_PER_PLATFORM, 
			devIds, 
			&numDevices);
		if (status != CL_DEVICE_NOT_FOUND) {
			/* At least one device found, lets take note */
			gef_if_error_create_goto(
				*err, 
				CLU_UTILS_ERROR, 
				CL_SUCCESS != status, 
				CLU_OCL_ERROR, 
				error_handler, 
				"clu_zone_new: get device Ids (OpenCL error %d: %s).", 
				status,
				clerror_get(status));
			for (unsigned int j = 0; j < numDevices; j++) {
				/* Keep device and platform IDs. */
				devInfos[totalNumDevices].device_id = devIds[j];
				devInfos[totalNumDevices].platform_id = platfIds[i];
				/* Get device name. */
				status = clGetDeviceInfo(
					devIds[j], 
					CL_DEVICE_NAME, 
					sizeof(devInfos[totalNumDevices].device_name), 
					devInfos[totalNumDevices].device_name, 
					NULL);
				gef_if_error_create_goto(
					*err, 
					CLU_UTILS_ERROR, 
					CL_SUCCESS != status, 
					CLU_OCL_ERROR, 
					error_handler, 
					"clu_zone_new: get device name info (OpenCL error %d: %s).",
					status,
					clerror_get(status));
				/* Get device vendor. */
				status = clGetDeviceInfo(
					devIds[j], 
					CL_DEVICE_VENDOR, 
					sizeof(devInfos[totalNumDevices].device_vendor), 
					devInfos[totalNumDevices].device_vendor, 
					NULL);
				gef_if_error_create_goto(
					*err, 
					CLU_UTILS_ERROR, 
					CL_SUCCESS != status, 
					CLU_OCL_ERROR, 
					error_handler, 
					"clu_zone_new: get device vendor info (OpenCL error %d: %s).", 
					status,
					clerror_get(status));
				/* Get platform name. */
				status = clGetPlatformInfo(
					platfIds[i],
					CL_PLATFORM_VENDOR,
					sizeof(devInfos[totalNumDevices].platform_name),
					devInfos[totalNumDevices].platform_name,
					NULL);
				gef_if_error_create_goto(
					*err, 
					CLU_UTILS_ERROR, 
					CL_SUCCESS != status, 
					CLU_OCL_ERROR, 
					error_handler, 
					"clu_zone_new: get platform info (OpenCL error %d: %s).",
					status,
					clerror_get(status));
				/* Increment total number of found devices. */
				totalNumDevices++;
			}
		}
	}
	
	/* Check whether any devices of the specified type were found */
	if (totalNumDevices == 0) {
		/* No devices of the specified type where found, return with error. */
		gef_if_error_create_goto(
			*err, 
			CLU_UTILS_ERROR, 
			1, 
			CLU_ERROR_DEVICE_NOT_FOUND, 
			error_handler, 
			"clu_zone_new: device not found.");
	} else {
		/* Several compatible devices found, choose one with given selector function. */
		deviceInfoIndex = devSel(devInfos, totalNumDevices, dsExtraArg);
		/* Test return value of selector function (if it is out of range, 
		 * there is a programming error). */
		g_assert_cmpint(deviceInfoIndex, >=, -1);
		g_assert_cmpint(deviceInfoIndex, <, totalNumDevices);
		/* If selector function returned -1, then no device is selectable. */
		if (deviceInfoIndex == -1) {
			gef_if_error_create_goto(
				*err, 
				CLU_UTILS_ERROR, 
				1, 
				CLU_ERROR_DEVICE_NOT_FOUND, 
				error_handler, 
				"clu_zone_new: specified device not found.");
		}
	}

	/* Store info about the selected device and platform. */
	zone->device_type = deviceType;
	zone->device_info = devInfos[deviceInfoIndex];

	/* Determine number of compute units for that device */
	status = clGetDeviceInfo(
		zone->device_info.device_id, 
		CL_DEVICE_MAX_COMPUTE_UNITS, 
		sizeof(cl_uint), 
		&zone->cu, 
		NULL);
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		CL_SUCCESS != status, 
		CLU_OCL_ERROR, 
		error_handler, 
		"clu_zone_new: get target device info (OpenCL error %d: %s).", 
		status,
		clerror_get(status));
	
	/* Create a context on that device. */
	cps[1] = (cl_context_properties) devInfos[deviceInfoIndex].platform_id;
	zone->context = clCreateContext(cps, 1, &zone->device_info.device_id, NULL, NULL, &status);
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		CL_SUCCESS != status, 
		CLU_OCL_ERROR, 
		error_handler, 
		"clu_zone_new: creating context (OpenCL error %d: %s).", 
		status,
		clerror_get(status));
	
	/* Create the specified command queues on that device */
	zone->numQueues = numQueues;
	zone->queues = (cl_command_queue*) malloc(numQueues * sizeof(cl_command_queue));
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		NULL == zone->queues, 
		CLU_ERROR_NOALLOC, 
		error_handler, 
		"Unable to allocate memory to keep OpenCL command queues in Zone."
	);
	
	for (unsigned int i = 0; i < numQueues; i++) {
		zone->queues[i] = clCreateCommandQueue(
			zone->context, 
			zone->device_info.device_id, 
			queueProperties, 
			&status);
		gef_if_error_create_goto(
			*err, 
			CLU_UTILS_ERROR, 
			CL_SUCCESS != status, 
			CLU_OCL_ERROR, 
			error_handler, 
			"clu_zone_new: creating command queue (OpenCL error %d: %s).", 
			status,
			clerror_get(status));
	}

	/* If we got here, everything is OK. */
	g_assert (err == NULL || *err == NULL);
	goto finish;
	
error_handler:
	/* If we got here there was an error, verify that it is so. */
	g_assert (err == NULL || *err != NULL);
	/* Free OpenCL zone. */
	if (zone != NULL) {
		clu_zone_free(zone);
		zone = NULL;
	}

finish:	

	/* Return. */
	return zone;

}
Example #18
0
void runProgram(int N, char *fileName)
{
	printf("GPU Symmetrize()..."
		"\nSquareMatrix[%d][%d]\n", N, N);

	int i,j;

	// initialize input array
	float *A;
	A = (float*)malloc(sizeof(float)*N*N);

	for( i = 0; i < N ; ++i )
	{
		for( j = 0; j < N ; ++j )
		{
			A[i*N + j] = j;	
		}
	}

	//  result
	float *Aout;
	Aout = (float*)malloc(sizeof(float)*N*N);


#ifdef DEBUG
	puts("A");
	check_2d_f(A,N,N);
#endif

	int NumK = 1;
	int NumE = 2;

	double gpuTime;
	cl_ulong gstart, gend;

	//------------------------------------------------
	//  OpenCL 
	//------------------------------------------------
	cl_int err;

	cl_platform_id platform;          // OpenCL platform
	cl_device_id device_id;           // device ID
	cl_context context;               // context
	cl_command_queue queue;           // command queue
	cl_program program;               // program

	cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK);

	cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE);    

	// read kernel file
	//char *fileName = "transpose_kernel.cl";
	char *kernelSource;
	size_t size;
	FILE *fh = fopen(fileName, "rb");
	if(!fh) {
		printf("Error: Failed to open kernel file!\n");
		exit(1);
	}
	fseek(fh,0,SEEK_END);
	size=ftell(fh);
	fseek(fh,0,SEEK_SET);
	kernelSource = malloc(size+1);
	size_t result;
	result = fread(kernelSource,1,size,fh);
	if(result != size){ fputs("Reading error", stderr);exit(1);}
	kernelSource[size] = '\0';
	
	// Bind to platform
	err = clGetPlatformIDs(1, &platform, NULL);
	OCL_CHECK(err);

	// Get ID for the device
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	OCL_CHECK(err);

	// Create a context  
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	OCL_CHECK(err);

	// Create a command queue 
	queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	OCL_CHECK(err);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err);
	OCL_CHECK(err);

	// turn on optimization for kernel
	char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only";

	err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
	if(err != CL_SUCCESS)
		printCompilerOutput(program, device_id);
	OCL_CHECK(err);



#ifdef SAVEBIN
	// Calculate size of binaries 
	size_t binary_size;
	err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL);
	OCL_CHECK(err);

	//printf("binary size = %ld\n", binary_size);

	unsigned char* bin;
	bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size);

	err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) , &bin, NULL);
	OCL_CHECK(err);

	//puts("save binaries");

	// Print the binary out to the output file
	fh = fopen("kernel.bin", "wb");
	fwrite(bin, 1, binary_size, fh);
	fclose(fh);

	puts("done save binaries");

#endif


	kernel[0] = clCreateKernel(program, "kernel_a", &err);
	OCL_CHECK(err);

	// memory on device
	cl_mem A_d    	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);
	cl_mem Aout_d   = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);


	// copy data to device
	err = clEnqueueWriteBuffer(queue, A_d, 	CL_TRUE, 0, sizeof(float)*N*N, 	A, 0, NULL , &event[0]); 
	OCL_CHECK(err);

	size_t localsize[2];
	size_t globalsize[2];

	localsize[0] = 16; 
	localsize[1] = 16;

	globalsize[0] = N;
	globalsize[1] = N;

	err  = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &Aout_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, globalsize, localsize, 0, NULL, NULL);
	OCL_CHECK(err);

	clFinish(queue);

	// read device data back to host
	clEnqueueReadBuffer(queue, Aout_d, CL_TRUE, 0, sizeof(float)*N*N, Aout, 0, NULL , &event[1]);

	err = clWaitForEvents(1,&event[1]);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL);
	OCL_CHECK(err);

	gpuTime = (double)(gend -gstart)/1000000000.0;



	//check_1d_f(sum, blks+1);

#ifdef DEBUG
	puts("Output");
	check_2d_f(Aout,N,N);
#endif

	printf("oclTime = %lf (s)\n", gpuTime );

	// free
	clReleaseMemObject(A_d);	
	clReleaseMemObject(Aout_d);	


	// // check
	// int flag = 1;
	// for(i=0;i<N;++i){
	// 	for(j=0;j<N;++j){
	// 		if(A[i*N+j] != At[j*N+i])		
	// 		{
	// 			flag  = 0;
	// 			break;
	// 		}
	// 	}
	// }
	// if( flag == 0 )
	// {
	// 	puts("Bugs! Check program.");
	// }else{
	// 	puts("Succeed!");	
	// }



	clReleaseProgram(program);
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	for(i=0;i<NumK;++i){
		clReleaseKernel(kernel[i]);
	}
	for(i=0;i<NumE;++i){
		clReleaseEvent(event[i]);
	}
	free(kernelSource);


#ifdef SAVEBIN
	free(bin);
#endif



	free(A);
	free(Aout);

	return;
}
Example #19
0
int main() {
 
    int i, j;
    char* value;
    size_t valueSize;
    cl_uint platformCount;
    cl_platform_id* platforms;
    cl_uint deviceCount;
    cl_device_id* devices;
    cl_uint maxComputeUnits;
 
    // get all platforms
    clGetPlatformIDs(0, NULL, &platformCount);
    platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
    clGetPlatformIDs(platformCount, platforms, NULL);
 
    for (i = 0; i < platformCount; i++) {
 
        // get all devices
        clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
        devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
        clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
 
        // for each device print critical attributes
        for (j = 0; j < deviceCount; j++) {
 
            // print device name
            clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize);
            value = (char*) malloc(valueSize);
            clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
            printf("%d. Device: %s\n", j+1, value);
            free(value);
 
            // print hardware device version
            clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize);
            value = (char*) malloc(valueSize);
            clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
            printf(" %d.%d Hardware version: %s\n", j+1, 1, value);
            free(value);
 
            // print software driver version
            clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize);
            value = (char*) malloc(valueSize);
            clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
            printf(" %d.%d Software version: %s\n", j+1, 2, value);
            free(value);
 
            // print c version supported by compiler for device
            clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize);
            value = (char*) malloc(valueSize);
            clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
            printf(" %d.%d OpenCL C version: %s\n", j+1, 3, value);
            free(value);
 
            // print parallel compute units
            clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS,
                    sizeof(maxComputeUnits), &maxComputeUnits, NULL);
            printf(" %d.%d Parallel compute units: %d\n", j+1, 4, maxComputeUnits);
 
        }
 
        free(devices);
 
    }
 
    free(platforms);
    exit(0);
 
}
Example #20
0
int main(int argc, char *argv[]){

	if (MODE == 5){

		printf("---OpenCL Test Code---\n\n");


		cl_int errNum;
		cl_uint numPlatforms;
		cl_platform_id *platforms = NULL;
		cl_uint numDevices;
		cl_device_id *devices = NULL;

		//platform info fields
		char vendor[1024], name[1024], version[1024];

		//device info fields
		size_t MAX_WORK_GROUP_SIZE;
		cl_ulong GLOBAL_MEM_CACHE_SIZE, GLOBAL_MEM_SIZE, LOCAL_MEM_SIZE, GLOBAL_MEM_CACHELINE_SIZE;
		cl_uint MAX_COMPUTE_UNITS, MAX_WORK_ITEM_DIMENSIONS;
		size_t MAX_WORK_ITEM_SIZES[3];
		char DEVICE_NAME[1024], DEVICE_VENDOR[1024], DEVICE_VERSION[1024], DRIVER_VERSION[1024], EXTENSIONS[2048];
		cl_device_mem_cache_type GLOBAL_MEM_CACHE_TYPE;


		//printf("Getting number of OpenCL Platforms...\n");
		errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
		if (errNum != CL_SUCCESS)
		{
			printf("Failed to get number of OpenCL platforms.\n");
			return 0;
		}
		else
		{

			//printf("found %d.\n", numPlatforms);
		}

		//printf("Allocating space for the platform info...\n");
		platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id));

		printf("---Platform Info---\n");
		errNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
		if (errNum != CL_SUCCESS)
		{
			printf("Failed to get platform info.\n");
			return 0;
		}
		else
		{
			clGetPlatformInfo (platforms[0], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL);
			clGetPlatformInfo (platforms[0], CL_PLATFORM_NAME, sizeof(name), name, NULL);
			clGetPlatformInfo (platforms[0], CL_PLATFORM_VERSION, sizeof(version), version, NULL);

			//printf("Got platform info.\n");
			printf("Vendor: \t%s\n", vendor);
			printf("Name:   \t%s\n", name);
			printf("Version:\t%s\n", version);
		}

		//printf("Getting number of devices...\n");
		errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
		if (errNum != CL_SUCCESS)
		{
			printf("Failed to get number of devices.\n");
			return 0;
		}
		else
		{
	    	//printf("Found %d.\n", numDevices);
	    }

		//printf("Allocating space for device info...\n");
		devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));

		printf("\n---Device Info---");
		errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
		if (errNum != CL_SUCCESS)
		{
			printf("Failed to get device info.\n");
			return 0;
		}
		else
		{

			int i, j = 0;
			for (i = 0; i < numDevices; i++ )
			{
				printf("\nDevice ID: %d\n", i+1);
				clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(DEVICE_NAME), DEVICE_NAME, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(DEVICE_VENDOR), DEVICE_VENDOR, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(DEVICE_VERSION), DEVICE_VERSION, NULL);
				clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(DRIVER_VERSION), DRIVER_VERSION, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_EXTENSIONS, sizeof(EXTENSIONS), EXTENSIONS, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS), &MAX_COMPUTE_UNITS, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(GLOBAL_MEM_SIZE), &GLOBAL_MEM_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(LOCAL_MEM_SIZE), &LOCAL_MEM_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(MAX_WORK_ITEM_DIMENSIONS), &MAX_WORK_ITEM_DIMENSIONS, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(MAX_WORK_ITEM_SIZES), MAX_WORK_ITEM_SIZES, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(MAX_WORK_GROUP_SIZE), &MAX_WORK_GROUP_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(GLOBAL_MEM_CACHE_SIZE), &GLOBAL_MEM_CACHE_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(GLOBAL_MEM_CACHELINE_SIZE), &GLOBAL_MEM_CACHELINE_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(GLOBAL_MEM_CACHE_TYPE), &GLOBAL_MEM_CACHE_TYPE, NULL);


				printf("Device Name:\t%s\n", DEVICE_NAME);
				printf("Device Vendor:\t%s\n", DEVICE_VENDOR);
				printf("Device Version:\t%s\n", DEVICE_VERSION);
				printf("Driver Version:\t%s\n", DRIVER_VERSION);
				printf("EXTENSIONS:\t%s\n", EXTENSIONS);
				printf("Number of CUs:\t%d\n", MAX_COMPUTE_UNITS);
				printf("GMem:\t\t%lld (Bytes)\n", (long long) GLOBAL_MEM_SIZE);
				printf("GMem $ Size:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHE_SIZE);
				printf("GMem $ Line:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHELINE_SIZE);
				if(GLOBAL_MEM_CACHE_TYPE == CL_NONE)
				{
					printf("GMem $ Type:\tCL_NONE\n");
				}
				else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_ONLY_CACHE)
				{
					printf("GMem $ Type:\tCL_READ_ONLY_CACHE\n");
				}

				else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_WRITE_CACHE)
				{
					printf("GMem $ Type:\tCL_READ_WRITE_CACHE\n");
				}
				printf("LMem:\t\t%lld (Bytes)\n", (long long) LOCAL_MEM_SIZE);
				printf("Work Group Size:%d (Max)\n", (int) MAX_WORK_GROUP_SIZE);
				printf("Work Item Dim:\t%d (Max)\n", MAX_WORK_ITEM_DIMENSIONS);
				printf("Work Item Size:\t");
				for(j = 0; j < MAX_WORK_ITEM_DIMENSIONS; j ++)
				{
						if (j != (MAX_WORK_ITEM_DIMENSIONS -1))
						printf("%d, ", (int) MAX_WORK_ITEM_SIZES[j]);

						if (j == (MAX_WORK_ITEM_DIMENSIONS -1))
						printf("%d ", (int) MAX_WORK_ITEM_SIZES[j]);
				}
				printf("(Max)\n");

			}

				//printf("Got device info.\n");
		}


	}

	else if (MODE == 4){
		cl_context context = 0;
	    cl_command_queue commandQueue = 0;
	    cl_program program = 0;
	    cl_device_id device = 0;

	    //Create an OpenCL context on first available platform
	    context = CreateContext();
	    if (context == NULL)
	    {
	        printf("Failed to create OpenCL context.\n");
	        return 1;
	    }

	    //Create a command-queue on the first device available on the created context
	    commandQueue = CreateCommandQueue(context, &device);
	    if (commandQueue == NULL)
	    {
	    	printf("Failed to create commandQueue.\n");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }

	    // Create OpenCL program and store the binary for future use.
	    printf("Attempting to create kernel binary from source.\n");
	    program = CreateProgram(context, device, KERNELPATHIN);
	    if (program == NULL)
	    {
	    	printf("Failed to create Program");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }

	    printf("Kernel is saved.\n");
	    if (SaveProgramBinary(program, device, KERNELPATHOUT) == false)
	    {
	        printf("Failed to write program binary.\n");
	        Cleanup(context, commandQueue, program, NULL);
	        return 1;
	     }

	    //printf("---Done---");

	    //return 1;

	}
	else if (MODE == 3){

		//todo free remaining objects not passed to cleanup

		//profiling
		int write_bytes = 0;
		int read_bytes = 0;
		/*unsigned long long start_cycles, stop_cycles;
		unsigned long long start_setup, stop_setup;
		unsigned long long start_write, stop_write;
		unsigned long long start_read, stop_read;
		unsigned long long start_finalize, stop_finalize;
		struct timespec start_time_t, stop_time_t;*/


		printf("Stream Mode\n\n");
		//clock_gettime(CLOCK_MONOTONIC, &start_time_t);
		//start_cycles = rdtsc();


		int i;
		time_t t;
		srand((unsigned) time(&t));

	    // Create the two input vectors
	    printf("\nHostside malloc(s)\n");
	    fflush(stdout);
	    int *A = (int*)malloc(sizeof(int)*(SIZE*SIZE));
	    int *B = (int*)malloc(sizeof(int)*(SIZE*SIZE));
	    int *C = (int*)malloc(sizeof(int)*(SIZE*SIZE));

	    //profile
	    //bytes += 3 * sizeof(int)*(SIZE*SIZE);

	    printf("\nHostside mat init\n");
	    fflush(stdout);
	    for(i = 0; i < (SIZE*SIZE); i++) {
	        A[i] = B[i] = rand() % 10 + 1;;
	    }


	    //print matrix
    	printf("Matrix A[%d][%d]:\n", SIZE, SIZE);
	   	for(i = 0; i < (SIZE*SIZE); i++)
	    {
	   		printf("%3d ", A[i]);
	   		if(((i + 1) % SIZE) == 0)
	   			printf("\n");
	    }

	    //print matrix
	   	printf("\nMatrix B[%d][%d]:\n", SIZE, SIZE);
	    for(i = 0; i < (SIZE*SIZE); i++)
	    {
	    	printf("%3d ", B[i]);
	        if(((i + 1) % SIZE) == 0)
	        	printf("\n");
	    }


	    //syscall(STATS_RESET);


	    //Get platform and device information
	    cl_context context = 0;
	    cl_command_queue commandQueue = 0;
	    cl_program program = 0;
	    cl_device_id device = 0;
	    cl_kernel kernel = 0;
	    cl_uint err = 0;
	    //char *filepath = NULL;

	    //Create the context
	    printf("\nCreateContext\n");
	    fflush(stdout);
	    context = CreateContext();
	    if (context == NULL)
	    {
	    	printf("Failed to create OpenCL context.\n");
	    	return 1;
	    }

	   /* printf("\nEnd CreateContext\n");
	    fflush(stdout);*/

	    //Create a command-queue on the first device available on the created context
	    printf("\nCreateCommandQueue\n");
	    fflush(stdout);
	    commandQueue = CreateCommandQueue(context, &device);
	    if (commandQueue == NULL)
	    {
	    	printf("Failed to create command queue.\n");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }

	    //create the program from the binary
	    //program = CreateProgramFromBinary(context, device, "/home/stardica/Desktop/Kernels/vector.cl.bin.GPU");
	    //strcat(KERNELPATHOUT, ".GPU")
	    printf("\nCreateProgramFromBinary\n");
	    fflush(stdout);
	    program = CreateProgramFromBinary(context, device, KERNEL);
	    if (program == NULL)
	    {
	    	printf("Failed to load kernel binary,\n");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }



	    // Create OpenCL kernel
	    printf("\nclCreateKernel\n");
	    fflush(stdout);
	    kernel = clCreateKernel(program, "Matrix", NULL);
	    if (kernel == NULL)
	    {
	    	printf("Failed to create kernel.\n");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }

	    cl_mem a_mem_obj = 0;
	    cl_mem b_mem_obj = 0;
	    cl_mem c_mem_obj = 0;

  	    //Create memory buffers on the device for each vector

	    printf("\nclCreateBuffer(s)\n");
	    fflush(stdout);
	    if(LOCALMEM == 1 && CACHEDMEM == 0)
		{
			//this creates uncached buffers in the GPU's local memory
			#if M2S_CGM_OCL_SIM
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);

			}
			#else
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
			}
			#endif
		}

		if(SYSMEM == 1 && CACHEDMEM == 0)
		{
			//this creates uncached buffers in the system memory
			#if M2S_CGM_OCL_SIM
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
			}
			#else
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
			}
			#endif
		}

		if(SYSMEM == 1 && CACHEDMEM == 1)
		{
			//this creates cached buffers in the system memory.
			#if M2S_CGM_OCL_SIM
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
			}
			#else
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
			}
			#endif
		}

	    if (a_mem_obj == NULL || b_mem_obj == NULL  || c_mem_obj == NULL)
	    {
	    	printf("Failed to create memory objects.\n");
	    	Cleanup(context, commandQueue, program, kernel);
	    	return 1;
	    }

	    //Copy the lists A and B to their respective memory buffers
	    printf("\nclEnqueueWriteBuffer(s)\n");
	    fflush(stdout);
	    write_bytes += 2 * sizeof(int)*(SIZE*SIZE);
	   // start_write = rdtsc();
	    clEnqueueWriteBuffer(commandQueue, a_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), A, 0, NULL, NULL);
	    clEnqueueWriteBuffer(commandQueue, b_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), B, 0, NULL, NULL);
	   // stop_write = rdtsc();


	    // Set the arguments of the kernel
	    int *size = (int *)SIZE;
	    printf("\nclSetKernelArg(s)\n");
	    fflush(stdout);
	    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&c_mem_obj);
	    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&a_mem_obj);
	    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&b_mem_obj);
	    err = clSetKernelArg(kernel, 3, sizeof(int), (void *)&size);
	    if (err != CL_SUCCESS)
	    {
	    	printf("Kernel args not set.\n");
	    	return 1;
	    }

	    // Execute the OpenCL kernel on the list
	    size_t GlobalWorkSize[2], LocalWorkSize[2];

	    //Rember that in OpenCL we need to express the globalWorkSize in
	    //terms of the total number of threads. The underlying OpenCL API
	    //will look at the globalWorkSize and divide by the localWorkSize
	    //to arrive at a 64 by 64 NDRange of 16 by 16 work groups.

	    GlobalWorkSize[0] = GWS_0;//SIZE*SIZE*SIZE; // Process the entire lists
	    GlobalWorkSize[1] = GWS_1;//SIZE*SIZE*SIZE; // Process the entire lists
	    LocalWorkSize[0] = LWS_0; //SIZE Divide work items into groups of 64
	    LocalWorkSize[1] = LWS_1; //SIZE Divide work items into groups of 64


	    //used null for local, lets OpenCL determine the best local size.
	    //err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
	    printf("\nclEnqueueNDRangeKernel\n");
	    fflush(stdout);
	    err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
	    if (err != CL_SUCCESS)
	    {
	    	printf("ND range not enqueued. Code: %d\n", err);
	    	return 1;
	    }


	    //Read the memory buffer C on the device to the local variable C
	    printf("\nclEnqueueReadBuffer\n");
	    fflush(stdout);
	    read_bytes += sizeof(int)*(SIZE*SIZE);
	    //start_read = rdtsc();
	    err = clEnqueueReadBuffer(commandQueue, c_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), C, 0, NULL, NULL);
	   // stop_read = rdtsc();
	    if (err != CL_SUCCESS)
	    {
	    	printf("Buffer not returned.\n");
	    	return 1;
	    }

	    //syscall(STATS_STOP);

	    //print matrix
	    printf("\nMatrix C[%d][%d] = A[%d][%d]*B[%d][%d]:\n", SIZE, SIZE, SIZE, SIZE, SIZE, SIZE);
	    for(i = 0; i < (SIZE*SIZE); i++)
	    {
	    	printf("%3d ", C[i]);
	        if(((i + 1) % SIZE) == 0)
	        printf("\n");
	    }

	    printf("\nHostside clean up\n");
	    fflush(stdout);
	    err = clFlush(commandQueue);
	    err = clFinish(commandQueue);
	    Cleanup(context, commandQueue, program, kernel);
	    err = clReleaseMemObject(a_mem_obj);
	    err = clReleaseMemObject(b_mem_obj);
	    err = clReleaseMemObject(c_mem_obj);
	    free(A);
	    free(B);
	    free(C);

	    //printf("---Done---");



	    /*stop_cycles = rdtsc();
	    clock_gettime(CLOCK_MONOTONIC, &stop_time_t);
	    printf("Total cycles = %llu\n", (stop_cycles - start_cycles));

	    long int time_s = stop_time_t.tv_nsec - start_time_t.tv_nsec;
	    printf("Approximate runtime (check) = %ld ms\n", (time_s/1000000));

	    printf("Bytes written %d\n", write_bytes);
	    printf("transfer cycles = %llu\n", (stop_write - start_write));
	    printf("start at = %llu\n", (start_write - start_cycles));

	    printf("Bytes read %d\n", read_bytes);
	    printf("transfer cycles = %llu\n", (stop_read - start_read));
	    printf("start at = %llu\n", (start_read - start_cycles));*/

	}
	else if (MODE == 2){

		printf("Multi Thread Mode\n");
		//cal this:
		//assignToThisCore(0);//assign to core 0,1,2,...

		unsigned long long a, b;
	    int i = 0;
	    int j = 0;
	    int k = 0;

		LoadMatrices();

		pthread_t tid[SIZE*SIZE];


		//printf("waiting\n");
		//start our threads
		a = rdtsc();
		syscall(BEGIN_PARALLEL_SECTION);

		for(i=0;i<SIZE;i++){
			for(j=0;j<SIZE;j++){
				struct RowColumnData *RCData = (struct RowColumnData *) malloc(sizeof(struct RowColumnData));
				RCData->RowNum = i;
				RCData->ColumnNum = j;
				//printf("Thread create %d Row %d Col %d\n", k, RCData->RowNum, RCData->ColumnNum);
				pthread_create(&tid[k], NULL, RowColumnMultiply, RCData);
				k++;
			}
		}

		//Join threads////////////////////////////
		for (i=0;i<NUM_THREADS;i++)
		{
			pthread_join(tid[i], NULL);
		}
		syscall(END_PARALLEL_SECTION);
		b = rdtsc();

		PrintMatrices();



		//printf("\nend clock Cycles: %llu\n", b);
		printf("\nDone. Number of clock Cycles: %llu\n", b-a);

	}
	else if (MODE == 1)
	{

		printf("Single Thread Mode\n\n");
		//unsigned long long a, b;
		//a = rdtsc();
		//time_t t;
		int i,j,k;

		//srand((unsigned) time(&t));

		LoadMatrices();

		//multiply mats/////////////////////////
		for (i=0;i<SIZE;i++){
			for(j=0;j<SIZE;j++){
				for(k=0;k<SIZE;k++){
					matC[i][j] = matC[i][j] + (matA[i][k] * matB[k][j]);
					}
			}
		}

		PrintMatrices();

		//b = rdtsc();
		//printf("\nDone. Number of clock Cycles: %llu\n", b-a);
	}
	else if (MODE == 0)
	{
		printf("---Misc Tests---\n\n");

		printf("size of long long is %d\n", (int) sizeof(long long));
		printf("size of long is %d\n", (int) sizeof(long));
		printf("size of int is %d\n", (int) sizeof(int));
		printf("size of short is %d\n", (int) sizeof(short));
		printf("size of char * %d\n", (int) sizeof(char *));
		printf("size of unsigned int (word) %d\n", (int) sizeof(unsigned int));

		char *string = "test string";
		printf("Here is the string 1: \"%s\"\n", string);

		//Using the struct
		//set string variable and point to print_me.
		object.string = strdup(string);
		object.print_me = (void (*)(void *)) print_me;

		//use of print_me
		object.print_me(object.string);

		//pointer fun
		struct Object *ptr = &object;
		printf("this is the value of the pointer to struct object: %p\n", ptr);
		object.next=&object;
		printf("this is the value of the pointer to struct object: %p\n", object.next);
		object_ptr = &object;
		object_ptr->next = &object;
		printf("this is the value of the pointer to struct object: %p\n", object_ptr->next);

		//Macro fun
		PRINT(ptr, ptr);
		PRINT(object.next, object.next);
		PRINT(object_ptr->next, object_ptr->next);

		int mmu_page_size = 1 << 12;

		printf("mmu_papge_size = %d\n", mmu_page_size);


		//setjmp and longjmp fun
		/*jmp_buf environment;
		int i;

		i = setjmp(environment);
		printf("\n\nsetjmp returned = %d\n", i);

		printf("Env 1:\n");

		int x = 0;
		for(x = 0; x < 6; x++)
		{
			printf("  %x\n", environment[x]);
		}


		if (i < 3)
		{
			longjmp(environment, 3);
		}

		printf("longjmp finished with i = %d\n", i);*/


	}
	else
	{

		printf("---Invalid Mode Set---\n\n");

	}

	printf("\n---Done---\n");
	return 1;
}
Example #21
0
int main(int argc, char **argv) {




	if (find_option(argc, argv, "-h") >= 0)
	{
		printf("Options:\n");
		printf("-h to see this help\n");
		printf("-n <int> to set the number of particles\n");
		printf("-o <filename> to specify the output file name\n");
		printf("-s <filename> to specify the summary output file name\n");
		return 0;
	}


	int n = read_int(argc, argv, "-n", 1000);

	char *savename = read_string(argc, argv, "-o", NULL);
	char *sumname = read_string(argc, argv, "-s", NULL);

	// For return values.
	cl_int ret;

	// OpenCL stuff.
	// Loading kernel files.
	FILE *kernelFile;
	char *kernelSource;
	size_t kernelSize;

	kernelFile = fopen("simulationKernel.cl", "r");

	if (!kernelFile) {
		fprintf(stderr, "No file named simulationKernel.cl was found\n");
		exit(-1);
	}
	kernelSource = (char*)malloc(MAX_SOURCE_SIZE);
	kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile);
	fclose(kernelFile);

	// Getting platform and device information
	cl_platform_id platformId = NULL;
	cl_device_id deviceID = NULL;
	cl_uint retNumDevices;
	cl_uint retNumPlatforms;
	ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms);
	// Different types of devices to pick from. At the moment picks the default opencl device.
	//CL_DEVICE_TYPE_GPU
	//CL_DEVICE_TYPE_ACCELERATOR
	//CL_DEVICE_TYPE_DEFAULT
	//CL_DEVICE_TYPE_CPU
	ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_ACCELERATOR, 1, &deviceID, &retNumDevices);

	// Max workgroup size
	size_t max_available_local_wg_size;
	ret = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_available_local_wg_size, NULL);
	// Creating context.
	cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &ret);


	// Creating command queue
        cl_command_queue commandQueue = clCreateCommandQueueWithProperties (context, deviceID, 0, &ret);
	
	// Build program
	cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&kernelSize, &ret);
//	printf("program = ret %i \n", ret);
	ret = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
//	printf("clBuildProgram: ret %i \n", ret);
	
	// Create kernels
	cl_kernel forceKernel = clCreateKernel(program, "compute_forces_gpu", &ret);

	cl_kernel moveKernel = clCreateKernel(program, "move_gpu", &ret);

	cl_kernel binInitKernel = clCreateKernel(program, "bin_init_gpu", &ret);
	cl_kernel binKernel = clCreateKernel(program, "bin_gpu", &ret);

	FILE *fsave = savename ? fopen(savename, "w") : NULL;
	FILE *fsum = sumname ? fopen(sumname, "a") : NULL;
	particle_t *particles = (particle_t*)malloc(n * sizeof(particle_t));

	// GPU particle data structure
	cl_mem d_particles = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(particle_t), NULL, &ret);

	// Set size
	set_size(n);

	init_particles(n, particles);

	double copy_time = read_timer();

	// Copy particles to device.
	ret = clEnqueueWriteBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, NULL);
	copy_time = read_timer() - copy_time;
	

	// Calculating thread and thread block counts.
	// sizes
	size_t globalItemSize;
	size_t localItemSize;
	// Global item size
	if (n <= NUM_THREADS) {
		globalItemSize = NUM_THREADS;
		localItemSize = 16;
	}
	else if (n % NUM_THREADS != 0) {
		globalItemSize = (n / NUM_THREADS + 1) * NUM_THREADS;
	}
	else {
		globalItemSize = n;
	}

	// Local item size
	localItemSize = globalItemSize / NUM_THREADS;	

	// Bins and bin sizes.
	// Because of uniform distribution we will know that bins size is amortized. Therefore I picked the value of 10.
	// There will never be 10 particles in one bin.
	int maxParticles = 10;
	
	// Calculating the number of bins.
	int numberOfBins = (int)ceil(size/(2*cutoff)) + 2;
	
	// Bins will only exist on the device.
	particle_t* bins;
	
	// How many particles are there in each bin - also only exists on the device.
	volatile int* binSizes;
	
	// Number of bins to be initialized.
	size_t clearAmt = numberOfBins*numberOfBins;
	
	// Allocate memory for bins on the device.
	cl_mem d_binSizes = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * sizeof(volatile int), NULL, &ret);
	cl_mem d_bins = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * maxParticles * sizeof(particle_t), NULL, &ret);
	
	// SETTING ARGUMENTS FOR THE KERNELS
	
	// Set arguments for the init / clear kernel
	ret = clSetKernelArg(binInitKernel, 0, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(binInitKernel, 1, sizeof(int), &numberOfBins);

	// Set arguments for the binning kernel
	ret = clSetKernelArg(binKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(binKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(binKernel, 2, sizeof(cl_mem), (void *)&d_bins);
	ret = clSetKernelArg(binKernel, 3, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(binKernel, 4, sizeof(int), &numberOfBins);

	// Set arguments for force kernel.
	ret = clSetKernelArg(forceKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(forceKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(forceKernel, 2, sizeof(cl_mem), (void *)&d_bins);
	ret = clSetKernelArg(forceKernel, 3, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(forceKernel, 4, sizeof(int), &numberOfBins);


	// Set arguments for move kernel
	ret = clSetKernelArg(moveKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(moveKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(moveKernel, 2, sizeof(double), &size);
	
	
	// Variable to check if kernel execution is done.
	cl_event kernelDone;
	
	
	double simulation_time = read_timer();
	int step = 0;
	for (step = 0; step < NSTEPS; step++) {


		// Execute bin initialization (clearing after first iteration)
		ret = clEnqueueNDRangeKernel(commandQueue, binInitKernel, 1, NULL, &clearAmt, NULL, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);
		// Execute binning kernel
		ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
//		ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);	
		// Execute force kernel
		ret = clEnqueueNDRangeKernel(commandQueue, forceKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);
		// Execute move kernel
		ret = clEnqueueNDRangeKernel(commandQueue, moveKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);

		if (fsave && (step%SAVEFREQ) == 0) {
			// Copy the particles back to the CPU
			ret = clEnqueueReadBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, &kernelDone);
			ret = clWaitForEvents(1, &kernelDone);

			save(fsave, n, particles);
		}

	}
	simulation_time = read_timer() - simulation_time;
	printf("CPU-GPU copy time = %g seconds\n", copy_time);
	printf("n = %d, simulation time = %g seconds\n", n, simulation_time);

	if (fsum)
		fprintf(fsum, "%d %lf \n", n, simulation_time);

	if (fsum)
		fclose(fsum);
	free(particles);
	if (fsave)
		fclose(fsave);


	ret = clFlush(commandQueue);
	ret = clFinish(commandQueue);
	ret = clReleaseCommandQueue(commandQueue);
	ret = clReleaseKernel(forceKernel);
	ret = clReleaseKernel(moveKernel);
	ret = clReleaseProgram(program);
	ret = clReleaseMemObject(d_particles);
	ret = clReleaseContext(context);


	return 0;
}
void WorkScheduler::initialize(bool use_opencl, int num_cpu_threads)
{
	/* initialize highlighting */
	if (!g_highlightInitialized) {
		if (g_highlightedNodesRead) MEM_freeN(g_highlightedNodesRead);
		if (g_highlightedNodes)     MEM_freeN(g_highlightedNodes);

		g_highlightedNodesRead = NULL;
		g_highlightedNodes = NULL;

		COM_startReadHighlights();

		g_highlightInitialized = true;
	}

#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
	/* deinitialize if number of threads doesn't match */
	if (g_cpudevices.size() != num_cpu_threads) {
		Device *device;

		while (g_cpudevices.size() > 0) {
			device = g_cpudevices.back();
			g_cpudevices.pop_back();
			device->deinitialize();
			delete device;
		}

		g_cpuInitialized = false;
	}

	/* initialize CPU threads */
	if (!g_cpuInitialized) {
		for (int index = 0; index < num_cpu_threads; index++) {
			CPUDevice *device = new CPUDevice();
			device->initialize();
			g_cpudevices.push_back(device);
		}

		g_cpuInitialized = true;
	}

#ifdef COM_OPENCL_ENABLED
	/* deinitialize OpenCL GPU's */
	if (use_opencl && !g_openclInitialized) {
		g_context = NULL;
		g_program = NULL;

		if (clewInit() != CLEW_SUCCESS) /* this will check for errors and skip if already initialized */
			return;

		if (clCreateContextFromType) {
			cl_uint numberOfPlatforms = 0;
			cl_int error;
			error = clGetPlatformIDs(0, 0, &numberOfPlatforms);
			if (error == -1001) { }   /* GPU not supported */
			else if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
			if (G.f & G_DEBUG) printf("%u number of platforms\n", numberOfPlatforms);
			cl_platform_id *platforms = (cl_platform_id *)MEM_mallocN(sizeof(cl_platform_id) * numberOfPlatforms, __func__);
			error = clGetPlatformIDs(numberOfPlatforms, platforms, 0);
			unsigned int indexPlatform;
			for (indexPlatform = 0; indexPlatform < numberOfPlatforms; indexPlatform++) {
				cl_platform_id platform = platforms[indexPlatform];
				cl_uint numberOfDevices = 0;
				clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 0, &numberOfDevices);
				if (numberOfDevices <= 0)
					continue;

				cl_device_id *cldevices = (cl_device_id *)MEM_mallocN(sizeof(cl_device_id) * numberOfDevices, __func__);
				clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numberOfDevices, cldevices, 0);

				g_context = clCreateContext(NULL, numberOfDevices, cldevices, clContextError, NULL, &error);
				if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
				const char *cl_str[2] = {datatoc_COM_OpenCLKernels_cl, NULL};
				g_program = clCreateProgramWithSource(g_context, 1, cl_str, 0, &error);
				error = clBuildProgram(g_program, numberOfDevices, cldevices, 0, 0, 0);
				if (error != CL_SUCCESS) {
					cl_int error2;
					size_t ret_val_size = 0;
					printf("CLERROR[%d]: %s\n", error, clewErrorString(error));
					error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
					if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
					char *build_log = (char *)MEM_mallocN(sizeof(char) * ret_val_size + 1, __func__);
					error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
					if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
					build_log[ret_val_size] = '\0';
					printf("%s", build_log);
					MEM_freeN(build_log);
				}
				else {
					unsigned int indexDevices;
					for (indexDevices = 0; indexDevices < numberOfDevices; indexDevices++) {
						cl_device_id device = cldevices[indexDevices];
						cl_int vendorID = 0;
						cl_int error2 = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, NULL);
						if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error2, clewErrorString(error2)); }
						OpenCLDevice *clDevice = new OpenCLDevice(g_context, device, g_program, vendorID);
						clDevice->initialize();
						g_gpudevices.push_back(clDevice);
					}
				}
				MEM_freeN(cldevices);
			}
			MEM_freeN(platforms);
		}

		g_openclInitialized = true;
	}
#endif
#endif
}
Example #23
0
int main( void )
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX;
    float *X;
    cl_event event = NULL;
    int ret = 0;

    const size_t N0 = 4, N1 = 4, N2 = 4;
    char platform_name[128];
    char device_name[128];

    /* FFT library realted declarations */
    clfftPlanHandle planHandle;
    clfftDim dim = CLFFT_3D;
    size_t clLengths[3] = {N0, N1, N2};

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs( 1, &platform, NULL );

    size_t ret_param_size = 0;
    err = clGetPlatformInfo(platform, CL_PLATFORM_NAME,
            sizeof(platform_name), platform_name,
            &ret_param_size);
    printf("Platform found: %s\n", platform_name);

    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL );

    err = clGetDeviceInfo(device, CL_DEVICE_NAME,
            sizeof(device_name), device_name,
            &ret_param_size);
    printf("Device found on the above platform: %s\n", device_name);

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
    queue = clCreateCommandQueue( ctx, device, 0, &err );

    /* Setup clFFT. */
    clfftSetupData fftSetup;
    err = clfftInitSetupData(&fftSetup);
    err = clfftSetup(&fftSetup);

    /* Allocate host & initialize data. */
    /* Only allocation shown for simplicity. */
    size_t buffer_size  = N0 * N1 * N2 * 2 * sizeof(*X);
    X = (float *)malloc(buffer_size);

    /* print input array just using the
     * indices to fill the array with data */
    printf("\nPerforming fft on an three dimensional array of size N0 x N1 x N2 : %ld x %ld x %ld\n", N0, N1, N2);
    int i, j, k;
    i = j = k = 0;
    for (i=0; i<N0; ++i) {
        for (j=0; j<N1; ++j) {
            for (k=0; k<N2; ++k) {
                float x = 0.0f;
                float y = 0.0f;
                if (i==0 && j==0 && k==0) {
                    x = y = 0.5f;
                }
                unsigned idx = 2*(k+j*N1+i*N0*N1);
                X[idx] = x;
                X[idx+1] = y;
                printf("(%f, %f) ", X[idx], X[idx+1]);
            }
            printf("\n");
        }
        printf("\n");
    }

    /* Prepare OpenCL memory objects and place data inside them. */
    bufX = clCreateBuffer( ctx, CL_MEM_READ_WRITE, buffer_size, NULL, &err );

    err = clEnqueueWriteBuffer( queue, bufX, CL_TRUE, 0, buffer_size, X, 0, NULL, NULL );

    /* Create a default plan for a complex FFT. */
    err = clfftCreateDefaultPlan(&planHandle, ctx, dim, clLengths);

    /* Set plan parameters. */
    err = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE);
    err = clfftSetLayout(planHandle, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED);
    err = clfftSetResultLocation(planHandle, CLFFT_INPLACE);

    /* Bake the plan. */
    err = clfftBakePlan(planHandle, 1, &queue, NULL, NULL);

    /* Execute the plan. */
    err = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &queue, 0, NULL, NULL, &bufX, NULL, NULL);

    /* Wait for calculations to be finished. */
    err = clFinish(queue);

    /* Fetch results of calculations. */
    err = clEnqueueReadBuffer( queue, bufX, CL_TRUE, 0, buffer_size, X, 0, NULL, NULL );

    /* print output array */
    printf("\n\nfft result: \n");
    i = j = k = 0;
    for (i=0; i<N0; ++i) {
        for (j=0; j<N1; ++j) {
            for (k=0; k<N2; ++k) {
                unsigned idx = 2*(k+j*N1+i*N0*N1);
                printf("(%f, %f) ", X[idx], X[idx+1]);
            }
            printf("\n");
        }
        printf("\n");
    }
    printf("\n");

    /* Release OpenCL memory objects. */
    clReleaseMemObject( bufX );

    free(X);

    /* Release the plan. */
    err = clfftDestroyPlan( &planHandle );

    /* Release clFFT library. */
    clfftTeardown( );

    /* Release OpenCL working objects. */
    clReleaseCommandQueue( queue );
    clReleaseContext( ctx );

    return ret;
}
Example #24
0
/*
 * pgstrom_collect_device_info
 *
 * It collects properties of all the OpenCL devices. It shall be called once
 * by the OpenCL management worker process, prior to any other backends.
 */
static List *
construct_opencl_device_info(int platform_index)
{
	cl_platform_id	platforms[32];
	cl_device_id	devices[MAX_NUM_DEVICES];
	cl_uint			n_platform;
	cl_uint			n_devices;
	cl_int			i, j, rc;
	long			score_max = -1;
	List		   *result = NIL;

	rc = clGetPlatformIDs(lengthof(platforms),
						  platforms,
						  &n_platform);
	if (rc != CL_SUCCESS)
		elog(ERROR, "clGetPlatformIDs failed (%s)", opencl_strerror(rc));

	for (i=0; i < n_platform; i++)
	{
		pgstrom_platform_info  *pl_info;
		pgstrom_device_info	   *dev_info;
		long		score = 0;
		List	   *temp = NIL;

		pl_info = collect_opencl_platform_info(platforms[i]);
		pl_info->pl_index = i;

		rc = clGetDeviceIDs(platforms[i],
							CL_DEVICE_TYPE_CPU |
							CL_DEVICE_TYPE_GPU |
							CL_DEVICE_TYPE_ACCELERATOR,
							lengthof(devices),
							devices,
							&n_devices);
		if (rc != CL_SUCCESS)
			elog(ERROR, "clGetDeviceIDs failed (%s)", opencl_strerror(rc));

		elog(LOG, "PG-Strom: [%d] OpenCL Platform: %s", i, pl_info->pl_name);

		for (j=0; j < n_devices; j++)
		{
			dev_info = collect_opencl_device_info(devices[j]);
			dev_info->pl_info = pl_info;
			dev_info->dev_index = j;

			elog(LOG, "PG-Strom:  + device %s (%uMHz x %uunits, %luMB)",
				 dev_info->dev_name,
				 dev_info->dev_max_clock_frequency,
				 dev_info->dev_max_compute_units,
				 dev_info->dev_global_mem_size >> 20);

			/* rough estimation about computing power */
			if ((dev_info->dev_type & CL_DEVICE_TYPE_GPU) != 0)
				score += 32 * (dev_info->dev_max_compute_units *
							   dev_info->dev_max_clock_frequency);
			else
				score += (dev_info->dev_max_compute_units *
						  dev_info->dev_max_clock_frequency);

			temp = lappend(temp, dev_info);
		}

		if (platform_index == i || (platform_index < 0 && score > score_max))
		{
			opencl_platform_id = platforms[i];
			opencl_num_devices = n_devices;
			for (j=0; j < n_devices; j++)
				opencl_devices[j] = devices[j];

			score_max = score;
			result = temp;
		}
	}

	/* show platform name if auto-selection */
	if (platform_index < 0 && result != NIL)
	{
		pgstrom_platform_info *pl_info
			= ((pgstrom_device_info *) linitial(result))->pl_info;
		elog(LOG, "PG-Strom: auto platform selection: %s", pl_info->pl_name);
	}

	if (result != NIL)
	{
		/*
		 * Create an OpenCL context
		 */
		opencl_context = clCreateContext(NULL,
										 opencl_num_devices,
										 opencl_devices,
										 NULL,
										 NULL,
										 &rc);
		if (rc != CL_SUCCESS)
			elog(ERROR, "clCreateContext failed: %s", opencl_strerror(rc));

		/*
		 * Create an OpenCL command queue for each device
		 */
		for (j=0; j < opencl_num_devices; j++)
		{
			opencl_cmdq[j] =
				clCreateCommandQueue(opencl_context,
									 opencl_devices[j],
									 CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
									 CL_QUEUE_PROFILING_ENABLE,
									 &rc);
			if (rc != CL_SUCCESS)
				elog(ERROR, "clCreateCommandQueue failed: %s",
					 opencl_strerror(rc));
		}
	}
	return result;
}
Example #25
0
int main(void) {
//time meassuring
  	struct timeval tvs;

//variables
	int 	Nx=1024;
	int		Ny=1024;
	int 	plotnum=0;
	int	  	Tmax=2;
	int 	plottime=0;
	int	  	plotgap=1;
	double	Lx=1.0;
	double 	Ly=1.0;
	double	dt=0.0;	
	double	A=0.0;
	double	B=0.0;
	double	Du=0.0;
	double	Dv=0.0;
//splitting coefficients
	double	a=0.5;	
	double 	b=0.5;
	double 	c=1.0;
//loop counters	
	int i=0;
	int j=0;
	int n=0;

	double*umax=NULL;
	double*vmax=NULL;
	parainit(&Nx,&Ny,&Tmax,&plotgap,&Lx,&Ly,&dt,&Du,&Dv,&A,&B);
	plottime=plotgap;
	vmax=(double*)malloc((Tmax/plotgap+1)*sizeof(double));
	umax=(double*)malloc((Tmax/plotgap+1)*sizeof(double));
//openCL variables
    cl_platform_id *platform_id = NULL;
    cl_kernel frequencies = NULL, initialdata = NULL, linearpart=NULL;
	cl_kernel nonlinearpart_a=NULL, nonlinearpart_b=NULL;
    cl_int ret;
    cl_uint num_platforms;
// Detect how many platforms there are.
	ret = clGetPlatformIDs(0, NULL, &num_platforms);
// Allocate enough space for the number of platforms.
	platform_id = (cl_platform_id*) malloc(num_platforms*sizeof(cl_platform_id));
// Store the platforms
	ret = clGetPlatformIDs(num_platforms, platform_id, NULL);
	printf("Found %d platform(s)!\n",num_platforms);
    cl_uint *num_devices;
	num_devices=(cl_uint*) malloc(num_platforms*sizeof(cl_uint));
    cl_device_id **device_id = NULL;
	device_id =(cl_device_id**) malloc(num_platforms*sizeof(cl_device_id*));
// Detect number of devices in the platforms
	for(i=0;i<num_platforms;i++){
		char buf[65536];
		size_t size;
		ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_VERSION,sizeof(buf),buf,&size);
		printf("%s\n",buf);
		ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,0,NULL,num_devices);
		printf("Found %d device(s) on platform %d!\n", num_devices[i],i);
		ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_NAME,sizeof(buf),buf,&size);
		printf("%s ",buf);
// Store numDevices from platform
		device_id[i]=(cl_device_id*) malloc(num_devices[i]*sizeof(device_id));
		ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,num_devices[i],device_id[i],NULL);
		for(j=0;j<num_devices[i];j++){
			ret = clGetDeviceInfo(device_id[i][j],CL_DEVICE_NAME,sizeof(buf),buf,&size);
			printf("%s (%d,%d)\n",buf,i,j);
		}
	}
//create context and command_queue
    cl_context context = NULL;
   	cl_command_queue command_queue = NULL;
//Which platform and device do i choose?
	int	chooseplatform=0;
	int	choosedevice=0;	  
	printf("Choose platform %d and device %d!\n",chooseplatform,choosedevice);
	context = clCreateContext( NULL, num_devices[chooseplatform], device_id[chooseplatform], NULL, NULL, &ret);
	if(ret!=CL_SUCCESS){printf("createContext ret:%d\n",ret); exit(1); }
	command_queue = clCreateCommandQueue(context, device_id[chooseplatform][choosedevice], 0, &ret);
	if(ret!=CL_SUCCESS){printf("createCommandQueue ret:%d\n",ret); exit(1); }

//OpenCL arrays
    cl_mem cl_u = NULL,cl_v = NULL;
   	cl_mem cl_uhat = NULL, cl_vhat = NULL;
    cl_mem cl_kx = NULL, cl_ky = NULL;

//FFT
	clfftPlanHandle planHandle;
    cl_mem tmpBuffer = NULL;
	fftinit(&planHandle,&context, &command_queue, &tmpBuffer, Nx, Ny);

//allocate gpu memory/
	cl_u=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret);
	cl_v=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret);
	cl_uhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret);
	cl_vhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret);
	cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(double), NULL, &ret);
	cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(double), NULL, &ret);

	printf("allocated space\n");
//load the kernels
	loadKernel(&frequencies,&context,&device_id[chooseplatform][choosedevice],"frequencies");
	loadKernel(&initialdata,&context,&device_id[chooseplatform][choosedevice],"initialdata"); 
	loadKernel(&linearpart,&context,&device_id[chooseplatform][choosedevice],"linearpart"); 
	loadKernel(&nonlinearpart_a,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_a"); 
	loadKernel(&nonlinearpart_b,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_b"); 

	size_t global_work_size[1] = {Nx*Ny};
	size_t global_work_size_X[1] = {Nx};
	size_t global_work_size_Y[1] = {Ny};
//frequencies
    ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_kx);
	ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Lx);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Nx);
    ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_X, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
    ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_ky);
	ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Ly);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Ny);
    ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_Y, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//printCL(&cl_kx,&command_queue,Nx,1);
//printCL(&cl_ky,&command_queue,1,Ny);
//inintial data
    ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(initialdata, 2, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(initialdata, 3, sizeof(int),(void* )&Ny);
	ret = clSetKernelArg(initialdata, 4, sizeof(double),(void* )&Lx);
	ret = clSetKernelArg(initialdata, 5, sizeof(double),(void* )&Ly);
    ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//make output
    writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
    umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
	printf("Got initial data, starting timestepping\n");
	mtime_s(&tvs);

	for(n=0;n<=Tmax;n++){
//nonlinearpart_a
    ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);	

//nonlinearpart_b
    ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

//linear
	fft2dfor(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer);
	fft2dfor(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer);
//printf("A%f,B%f\n",A,B);
    ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat);
    ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_vhat);
	ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void* )&cl_kx);
	ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void* )&cl_ky);
	ret = clSetKernelArg(linearpart, 4, sizeof(double),(void* )&Du);
	ret = clSetKernelArg(linearpart, 5, sizeof(double),(void* )&Dv);
	ret = clSetKernelArg(linearpart, 6, sizeof(double),(void* )&A);
	ret = clSetKernelArg(linearpart, 7, sizeof(double),(void* )&B);
	ret = clSetKernelArg(linearpart, 8, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(linearpart, 9, sizeof(double),(void* )&c);
	ret = clSetKernelArg(linearpart, 10, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(linearpart, 11, sizeof(int),(void* )&Ny);
    ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

	fft2dback(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer);
  	fft2dback(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer);

//nonlinearpart_b
    ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);		
//nonlinearpart_a
    ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);	
// done
	if(n==plottime){
		printf("time:%f, step:%d,%d,umax:%f,vmax:%f\n",n*dt,n,plotnum,umax[plotnum],vmax[plotnum]);
		plottime=plottime+plotgap;
		plotnum=plotnum+1;
   	 	writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    	writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
        umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
        vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
	}
}//end timestepping

	printf("Finished time stepping\n");
	mtime_e(&tvs,"Programm took:");
	writearray(umax,(Tmax/plotgap)+1,"u");
	writearray(vmax,(Tmax/plotgap)+1,"v");
	free(umax);
	free(vmax);	

	clReleaseMemObject(cl_u);
	clReleaseMemObject(cl_v);
	clReleaseMemObject(cl_uhat);
	clReleaseMemObject(cl_vhat);
	clReleaseMemObject(cl_kx);
	clReleaseMemObject(cl_ky);

    ret = clReleaseKernel(initialdata); 
    ret = clReleaseKernel(frequencies); 
    ret = clReleaseKernel(linearpart); 
    ret = clReleaseKernel(nonlinearpart_a);
    ret = clReleaseKernel(nonlinearpart_b);

	fftdestroy(&planHandle, &tmpBuffer);

	ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

	for(i=0;i<num_platforms;i++){free(device_id[i]);}
	free(device_id);
	free(platform_id);
	free(num_devices);
	printf("Program execution complete\n");

	return 0;
}
Example #26
0
xcl_world xcl_world_single(cl_device_type device_type, char *target_vendor, char *target_device) {
	int err;
	xcl_world world;
	cl_uint num_platforms;

	err = clGetPlatformIDs(0, NULL, &num_platforms);
	if (err != CL_SUCCESS) {
		printf("Error: no platforms available or OpenCL install broken");
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

	cl_platform_id *platform_ids = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms);

	if (platform_ids == NULL) {
		printf("Error: Out of Memory\n");
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

	err = clGetPlatformIDs(num_platforms, platform_ids, NULL);
	if (err != CL_SUCCESS) {
		printf("Error: Failed to find an OpenCL platform!\n");
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

	int i;
        char cl_platform_vendor[1001];
        //find target vendor if target_vendor is specified
        if (target_vendor != NULL) {
                for(i = 0; i < num_platforms; i++) {
                        err = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_VENDOR, 1000, (void *)cl_platform_vendor,NULL);
                        if (err != CL_SUCCESS) {
                                printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
                                printf("Test failed\n");
                                exit(EXIT_FAILURE);
                        }
                        if ((target_vendor != NULL) && (strcmp(cl_platform_vendor, target_vendor) == 0)) {
                                printf("INFO: Selected platform %d from %s\n", i, cl_platform_vendor);
                                world.platform_id = platform_ids[i];
                                break;
                        }
                }
        } else {
                for(i = 0; i < num_platforms; i++) {
                        err = clGetDeviceIDs(platform_ids[i], device_type,
                                             1, &world.device_id, NULL);
                        if (err == CL_SUCCESS) {
                                world.platform_id = platform_ids[i];
                                break;
                        }
                }            
        }
	free(platform_ids);
	if (i == num_platforms) {
		printf("Error: Failed to find a platform\n");
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

        if (target_device != NULL) {
                //find target device
                cl_device_id devices[16];  // compute device id 
                cl_uint num_devices;
                char cl_device_name[100];
                err = clGetDeviceIDs(world.platform_id, CL_DEVICE_TYPE_ACCELERATOR,
                                     16, devices, &num_devices);
                if (err != CL_SUCCESS) {
                        printf("Error: Failed to create a device group!\n");
                        printf("Test failed\n");
                        exit(EXIT_FAILURE);
                }

                //iterate all devices to select the target device. 
                for (i=0; i<num_devices; i++) {
                        err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 100, cl_device_name, 0);
                        if (err != CL_SUCCESS) {
                                printf("Error: Failed to get device name for device %d!\n", i);
                                printf("Test failed\n");
                                exit(EXIT_FAILURE);
                        }
                        //printf("CL_DEVICE_NAME %s\n", cl_device_name);
                        if (strcmp(cl_device_name, target_device) == 0) {
                                world.device_id = devices[i];
                                printf("INFO: Selected %s as the target device\n", cl_device_name);
                                break;
                        }
                }

                if (i == num_devices) {
                        printf("Error: Failed to find target device %s\n", target_device);
                        printf("Test failed\n");
                        exit(EXIT_FAILURE);
                }
        }

	world.context = clCreateContext(0, 1, &world.device_id,
	                                NULL, NULL, &err);
	if (err != CL_SUCCESS) {
		printf("Error: Failed to create a compute context!\n");
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

	world.command_queue = clCreateCommandQueue(world.context,
	                                           world.device_id,
	                                           CL_QUEUE_PROFILING_ENABLE,
	                                           &err);
	if (err != CL_SUCCESS) {
		printf("Error: Failed to create a command queue!\n");
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

	return world;
}
Example #27
0
	int main()
	{
	cl_int num_rand = 4096*256; /* The number of random numbers generated using one generator */
	int count_all, i, num_generator = sizeof(mts)/sizeof(mts[0]); /* The number of generators */
	double pi;
	cl_platform_id platform_id = NULL;
	cl_uint ret_num_platforms;
	cl_device_id device_id = NULL;
	cl_uint ret_num_devices;
	cl_context context = NULL;
	cl_command_queue command_queue = NULL;
	cl_program program = NULL;
	cl_kernel kernel_mt = NULL, kernel_pi = NULL;
	size_t kernel_code_size;
	char *kernel_src_str;
	cl_uint *result;
	cl_int ret;
	FILE *fp;
	cl_mem rand, count;
	size_t global_item_size[3], local_item_size[3];
	cl_mem dev_mts;
	cl_event ev_mt_end, ev_pi_end, ev_copy_end;
	cl_ulong prof_start, prof_mt_end, prof_pi_end, prof_copy_end;
 
	clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
	clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
	&ret_num_devices);
	context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
	result = (cl_uint*)malloc(sizeof(cl_uint)*num_generator);
 
	command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
	fp = fopen("mt.cl", "r");
	kernel_src_str = (char*)malloc(MAX_SOURCE_SIZE);
	kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);
 
	/* Create output buffer */
	rand = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_rand*num_generator, NULL, &ret);
	count = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_generator, NULL, &ret);
 
	/* Build Program*/
	program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
	(const size_t *)&kernel_code_size, &ret);
	clBuildProgram(program, 1, &device_id, "", NULL, NULL);
	kernel_mt = clCreateKernel(program, "genrand", &ret);
	kernel_pi = clCreateKernel(program, "calc_pi", &ret);
 
	/* Create input parameter */
	dev_mts = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(mts), NULL, &ret);
	clEnqueueWriteBuffer(command_queue, dev_mts, CL_TRUE, 0, sizeof(mts), mts, 0, NULL, NULL);
 
	/* Set Kernel Arguments */
	clSetKernelArg(kernel_mt, 0, sizeof(cl_mem), (void*)&rand); /* Random numbers (output of genrand) */
	clSetKernelArg(kernel_mt, 1, sizeof(cl_mem), (void*)&dev_mts); /* MT parameter (input to genrand) */
	clSetKernelArg(kernel_mt, 2, sizeof(num_rand), &num_rand); /* Number of random numbers to generate */
 
	clSetKernelArg(kernel_pi, 0, sizeof(cl_mem), (void*)&count); /* Counter for points within circle (output of calc_pi) */
	clSetKernelArg(kernel_pi, 1, sizeof(cl_mem), (void*)&rand); /* Random numbers (input to calc_pi) */
	clSetKernelArg(kernel_pi, 2, sizeof(num_rand), &num_rand); /* Number of random numbers used */
 
	global_item_size[0] = num_generator; global_item_size[1] = 1; global_item_size[2] = 1;
	local_item_size[0] = num_generator; local_item_size[1] = 1; local_item_size[2] = 1;
 
	/* Create a random number array */
	clEnqueueNDRangeKernel(command_queue, kernel_mt, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_mt_end);
 
	/* Compute PI */
	clEnqueueNDRangeKernel(command_queue, kernel_pi, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_pi_end);
 
	/* Get result */
	clEnqueueReadBuffer(command_queue, count, CL_TRUE, 0, sizeof(cl_uint)*num_generator, result, 0, NULL, &ev_copy_end);
 
	/* Average the values of PI */
	count_all = 0;
	for (i=0; i < num_generator; i++) {
	count_all += result[i];
	}
 
	pi = ((double)count_all)/(num_rand * num_generator) * 4;
	printf("pi = %f\n", pi);
 
	/* Get execution time info */
	clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &prof_start, NULL);
	clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_mt_end, NULL);
	clGetEventProfilingInfo(ev_pi_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_pi_end, NULL);
	clGetEventProfilingInfo(ev_copy_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_copy_end, NULL);
 
	printf(" mt: %f[ms]\n"
		" pi: %f[ms]\n"
		" copy: %f[ms]\n",
		(prof_mt_end - prof_start)/(1000000.0),
		(prof_pi_end - prof_mt_end)/(1000000.0),
		(prof_copy_end - prof_pi_end)/(1000000.0));
 
	clReleaseEvent(ev_mt_end);
	clReleaseEvent(ev_pi_end);
	clReleaseEvent(ev_copy_end);
 
	clReleaseMemObject(rand);
	clReleaseMemObject(count);
	clReleaseKernel(kernel_mt);
	clReleaseKernel(kernel_pi);
	clReleaseProgram(program);
	clReleaseCommandQueue(command_queue);
	clReleaseContext(context);
	free(kernel_src_str);
	free(result);
	return 0;
}
Example #28
0
void print_clinfo ()
{
  char *s = NULL;
  size_t len;
  unsigned i, j;
  cl_uint platform_count;
  cl_platform_id *platforms;

  /* Determine number of OpenCL Platforms available.  */
  clGetPlatformIDs (0, NULL, &platform_count);
  printf ("number of OpenCL Platforms available:\t%d\n", platform_count);
  /* Get platforms.  */
  platforms
    = (cl_platform_id*) malloc (sizeof (cl_platform_id) * platform_count);
  if (platforms == NULL)
    {
      fprintf (stderr, "malloc failed\n");
      exit (EXIT_FAILURE);
    }
  clGetPlatformIDs (platform_count, platforms, NULL);

  /* Querying platforms.  */
  for (i = 0; i < platform_count; i++)
    {
      cl_device_id *devices;
      cl_uint device_count;
      cl_device_id default_dev;
      printf (" OpenCL Platform:                       %d\n", i);

#define PRINT_PF_INFO(PARM)\
      clGetPlatformInfo (platforms[i], PARM, 0, NULL, &len); \
      s = realloc (s, len); \
      clGetPlatformInfo (platforms[i], PARM, len, s, NULL); \
      printf ("  %-36s%s\n", #PARM ":", s);

      PRINT_PF_INFO (CL_PLATFORM_PROFILE)
      PRINT_PF_INFO (CL_PLATFORM_VERSION)
      PRINT_PF_INFO (CL_PLATFORM_NAME)
      PRINT_PF_INFO (CL_PLATFORM_VENDOR)
      PRINT_PF_INFO (CL_PLATFORM_EXTENSIONS)
#undef PRINT_PF_INFO

      clGetDeviceIDs (platforms[i], CL_DEVICE_TYPE_DEFAULT, 1, &default_dev,
		      NULL);
      clGetDeviceInfo (default_dev, CL_DEVICE_NAME, 0, NULL, &len);
      s = realloc (s, len);
      clGetDeviceInfo (default_dev, CL_DEVICE_NAME, len, s, NULL);
      printf ("  CL_DEVICE_TYPE_DEFAULT:             %s\n", s);

      /* Determine number of devices.  */
      clGetDeviceIDs (platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &device_count);
      printf ("\n  number of OpenCL Devices available:   %d\n", device_count);
      /* Get devices.  */
      devices = (cl_device_id*) malloc (sizeof (cl_device_id) * device_count);
      if (devices == NULL)
	{
	  fprintf (stderr, "malloc failed\n");
	  exit (EXIT_FAILURE);
	}
      clGetDeviceIDs (platforms[i], CL_DEVICE_TYPE_ALL, device_count, devices,
		      NULL);

      /* Querying devices.  */
      for (j = 0; j < device_count; j++)
	{
	  cl_device_type dtype;
	  cl_device_mem_cache_type mctype;
	  cl_device_local_mem_type mtype;
	  cl_device_fp_config fpcfg;
	  cl_device_exec_capabilities xcap;
	  cl_command_queue_properties qprops;
	  cl_bool clbool;
	  cl_uint cluint;
	  cl_ulong clulong;
	  size_t sizet;
	  size_t workitem_size[3];
	  printf ("   OpenCL Device:                       %d\n", j);

#define PRINT_DEV_INFO(PARM)\
	  clGetDeviceInfo (devices[j], PARM, 0, NULL, &len); \
	  s = realloc (s, len); \
	  clGetDeviceInfo (devices[j], PARM, len, s, NULL); \
	  printf ("    %-41s%s\n", #PARM ":", s);

	  PRINT_DEV_INFO (CL_DEVICE_NAME)
	  PRINT_DEV_INFO (CL_DRIVER_VERSION)
	  PRINT_DEV_INFO (CL_DEVICE_VENDOR)
	  clGetDeviceInfo (devices[j], CL_DEVICE_VENDOR_ID, sizeof (cluint),
			   &cluint, NULL);
	  printf ("    CL_DEVICE_VENDOR_ID:                     %d\n", cluint);

	  clGetDeviceInfo (devices[j], CL_DEVICE_TYPE, sizeof (dtype), &dtype, NULL);
	  if (dtype & CL_DEVICE_TYPE_CPU)
	    printf ("    CL_DEVICE_TYPE:                          CL_DEVICE_TYPE_CPU\n");
	  if (dtype & CL_DEVICE_TYPE_GPU)
	    printf ("    CL_DEVICE_TYPE:                          CL_DEVICE_TYPE_GPU\n");
	  if (dtype & CL_DEVICE_TYPE_ACCELERATOR)
	    printf ("    CL_DEVICE_TYPE:                          CL_DEVICE_TYPE_ACCELERATOR\n");
	  if (dtype & CL_DEVICE_TYPE_DEFAULT)
	    printf ("    CL_DEVICE_TYPE:                          CL_DEVICE_TYPE_DEFAULT\n");

	  clGetDeviceInfo (devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_MAX_CLOCK_FREQUENCY:           %d\n", cluint);

	  PRINT_DEV_INFO (CL_DEVICE_PROFILE)
	  PRINT_DEV_INFO (CL_DEVICE_EXTENSIONS)

	  clGetDeviceInfo (devices[j], CL_DEVICE_AVAILABLE, sizeof (clbool), &clbool, NULL);
	  if (clbool == CL_TRUE)
	    printf ("    CL_DEVICE_AVAILABLE:                     CL_TRUE\n");
	  else
	    printf ("    CL_DEVICE_AVAILABLE:                     CL_FALSE\n");
	  clGetDeviceInfo (devices[j], CL_DEVICE_ENDIAN_LITTLE, sizeof (clbool), &clbool, NULL);
	  if (clbool == CL_TRUE)
	    printf ("    CL_DEVICE_ENDIAN_LITTLE:                 CL_TRUE\n");
	  else
	    printf ("    CL_DEVICE_ENDIAN_LITTLE:                 CL_FALSE\n");

	  clGetDeviceInfo (devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_MAX_COMPUTE_UNITS:             %d\n", cluint);
	  clGetDeviceInfo (devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof (sizet), &sizet, NULL);
	  printf ("    CL_DEVICE_MAX_WORK_GROUP_SIZE:           %d\n", sizet);
	  clGetDeviceInfo (devices[j], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:      %d\n", cluint);
	  clGetDeviceInfo (devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof (workitem_size), &workitem_size, NULL);
	  printf ("    CL_DEVICE_MAX_WORK_ITEM_SIZES:           %d / %d / %d\n", workitem_size[0], workitem_size[1], workitem_size[2]);

	  clGetDeviceInfo (devices[j], CL_DEVICE_ADDRESS_BITS, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_ADDRESS_BITS:                  %d\n", cluint);

	  clGetDeviceInfo (devices[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof (clulong), &clulong, NULL);
	  printf ("    CL_DEVICE_MAX_MEM_ALLOC_SIZE:            %llu\n", clulong);
	  clGetDeviceInfo (devices[j], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_MEM_BASE_ADDR_ALIGN:           %d\n", cluint);
	  clGetDeviceInfo(devices[j], CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE:      %d\n", cluint);
	  clGetDeviceInfo(devices[j], CL_DEVICE_MAX_PARAMETER_SIZE, sizeof (sizet), &sizet, NULL);
	  printf ("    CL_DEVICE_MAX_PARAMETER_SIZE:            %d\n", sizet);
	  clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof (clulong), &clulong, NULL);
	  printf ("    CL_DEVICE_GLOBAL_MEM_SIZE:               %llu\n", clulong);

	  clGetDeviceInfo (devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof (mctype), &mctype, NULL);
	  if (mctype & CL_NONE)
	    printf ("    CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:         CL_NONE\n");
	  if (mctype & CL_READ_ONLY_CACHE)
	    printf ("    CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:         CL_READ_ONLY_CACHE\n");
	  if (mctype & CL_READ_WRITE_CACHE)
	    printf ("    CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:         CL_READ_WRITE_CACHE\n");

	  clGetDeviceInfo (devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof (clulong), &clulong, NULL);
	  printf ("    CL_DEVICE_GLOBAL_MEM_CACHE_SIZE:         %llu\n", clulong);
	  clGetDeviceInfo (devices[j], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE:     %d\n", cluint);

	  clGetDeviceInfo (devices[j], CL_DEVICE_LOCAL_MEM_TYPE, sizeof (mtype), &mtype, NULL);
	  if (mtype & CL_LOCAL)
	    printf ("    CL_DEVICE_LOCAL_MEM_TYPE:                CL_LOCAL\n");
	  if (mtype & CL_GLOBAL)
	    printf ("    CL_DEVICE_LOCAL_MEM_TYPE:                CL_GLOBAL\n");

	  clGetDeviceInfo (devices[j], CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE:      %d\n", cluint);
	  clGetDeviceInfo (devices[j], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_MEM_BASE_ADDR_ALIGN:           %d\n", cluint);
	  clGetDeviceInfo (devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:   %d\n", cluint);
	  clGetDeviceInfo (devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:  %d\n", cluint);
	  clGetDeviceInfo (devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:    %d\n", cluint);
	  clGetDeviceInfo (devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:   %d\n", cluint);
	  clGetDeviceInfo (devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:  %d\n", cluint);
	  clGetDeviceInfo (devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof (cluint), &cluint, NULL);
	  printf ("    CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: %d\n", cluint);

	  clGetDeviceInfo (devices[j], CL_DEVICE_SINGLE_FP_CONFIG, sizeof (fpcfg), &fpcfg, NULL);
	  if (fpcfg & CL_FP_DENORM)
	    printf ("    CL_DEVICE_SINGLE_FP_CONFIG:              CL_FP_DENORM\n");
	  if (fpcfg & CL_FP_INF_NAN)
	    printf ("    CL_DEVICE_SINGLE_FP_CONFIG:              CL_FP_INF_NAN\n");
	  if (fpcfg & CL_FP_ROUND_TO_NEAREST)
	    printf ("    CL_DEVICE_SINGLE_FP_CONFIG:              CL_FP_ROUND_TO_NEAREST\n");
	  if (fpcfg & CL_FP_ROUND_TO_ZERO)
	    printf ("    CL_DEVICE_SINGLE_FP_CONFIG:              CL_FP_ROUND_TO_ZERO\n");

	  clGetDeviceInfo (devices[j], CL_DEVICE_EXECUTION_CAPABILITIES, sizeof (xcap), &xcap, NULL);
	  if (xcap & CL_EXEC_KERNEL )
	    printf ("    CL_DEVICE_EXECUTION_CAPABILITIES:        CL_EXEC_KERNEL\n");
	  if (xcap & CL_EXEC_NATIVE_KERNEL)
	    printf ("    CL_DEVICE_EXECUTION_CAPABILITIES:        CL_EXEC_NATIVE_KERNEL\n");

	  clGetDeviceInfo (devices[j], CL_DEVICE_QUEUE_PROPERTIES, sizeof (qprops), &qprops, NULL);
	  if (qprops & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
	    printf ("    CL_DEVICE_QUEUE_PROPERTIES:              CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE\n");
	  if (qprops & CL_QUEUE_PROFILING_ENABLE)
	    printf ("    CL_DEVICE_QUEUE_PROPERTIES:              CL_QUEUE_PROFILING_ENABLE\n");

	  clGetDeviceInfo (devices[j], CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof (sizet), &sizet, NULL);
	  printf ("    CL_DEVICE_PROFILING_TIMER_RESOLUTION:    %d\n", sizet);

	  clGetDeviceInfo (devices[j], CL_DEVICE_COMPILER_AVAILABLE, sizeof (clbool), &clbool, NULL);
	  if (clbool == CL_TRUE)
	    printf ("    CL_DEVICE_COMPILER_AVAILABLE:            CL_TRUE\n");
	  else
	    printf ("    CL_DEVICE_COMPILER_AVAILABLE:            CL_FALSE\n");
	  clGetDeviceInfo (devices[j], CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof (clbool), &clbool, NULL);
	  if (clbool == CL_TRUE)
	    printf ("    CL_DEVICE_ERROR_CORRECTION_SUPPORT:      CL_TRUE\n");
	  else
	    printf ("    CL_DEVICE_ERROR_CORRECTION_SUPPORT:      CL_FALSE\n");

	  clGetDeviceInfo (devices[j], CL_DEVICE_IMAGE_SUPPORT, sizeof (clbool), &clbool, NULL);
	  if (clbool == CL_FALSE)
	    {
	      printf ("    CL_DEVICE_IMAGE_SUPPORT:                 CL_FALSE\n");
	    }
	  else
	    {
	      printf ("    CL_DEVICE_IMAGE_SUPPORT:                 CL_TRUE\n");
	      clGetDeviceInfo (devices[j], CL_DEVICE_MAX_SAMPLERS, sizeof (cluint), &cluint, NULL);
	      printf ("    CL_DEVICE_MAX_SAMPLERS:                  %d\n", cluint);
	      clGetDeviceInfo (devices[j], CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof (cluint), &cluint, NULL);
	      printf ("    CL_DEVICE_MAX_READ_IMAGE_ARGS:           %d\n", cluint);
	      clGetDeviceInfo (devices[j], CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof (cluint), &cluint, NULL);
	      printf ("    CL_DEVICE_MAX_WRITE_IMAGE_ARGS:          %d\n", cluint);
	      clGetDeviceInfo (devices[j], CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof (sizet), &sizet, NULL);
	      printf ("    CL_DEVICE_IMAGE2D_MAX_WIDTH:             %d\n", sizet);
	      clGetDeviceInfo (devices[j], CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof (sizet), &sizet, NULL);
	      printf ("    CL_DEVICE_IMAGE2D_MAX_HEIGHT:            %d\n", sizet);
	      clGetDeviceInfo (devices[j], CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof (sizet), &sizet, NULL);
	      printf ("    CL_DEVICE_IMAGE3D_MAX_WIDTH:             %d\n", sizet);
	      clGetDeviceInfo (devices[j], CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof (sizet), &sizet, NULL);
	      printf ("    CL_DEVICE_IMAGE3D_MAX_HEIGHT:            %d\n", sizet);
	      clGetDeviceInfo (devices[j], CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof (sizet), &sizet, NULL);
	      printf ("    CL_DEVICE_IMAGE3D_MAX_DEPTH:             %d\n", sizet);
	    }
#undef PRINT_DEV_INFO
	} /* devices */
      free (devices);
    } /* platforms */
  free (s);
  free (platforms);
}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program binary */
        unsigned char *bin;
        size_t bin_len;
        cl_int bin_ret;
        
        /* Read program binary */
        if (argc == 2)
                bin = read_buffer((char *)argv[1], &bin_len);
        else
        {
                printf("error: No binary specified\n");
                exit(1);
        }
        
        /* Create a program */
        cl_program program;
        program = clCreateProgramWithBinary(context, 1, &device, &bin_len, (const unsigned char **)&bin, &bin_ret, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithBinary' failed\n");
                exit(1);
        }
        if (bin_ret != CL_SUCCESS)
        {
                printf("error: Invalid binary for device\n");
                exit(1);
        }
        printf("program=%p\n", program);
        
        /* Free binary */
        free(bin);
        
        printf("program binary loaded\n");
        printf("\n");

        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object*/
        cl_kernel kernel;
        kernel = clCreateKernel(program, "hypot_float4float4", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_float4 *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_float4));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_float4){{2.0, 2.0, 2.0, 2.0}};
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float4), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float4), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create and init host side src buffer 1 */
        cl_float4 *src_1_host_buffer;
        src_1_host_buffer = malloc(num_elem * sizeof(cl_float4));
        for (int i = 0; i < num_elem; i++)
                src_1_host_buffer[i] = (cl_float4){{2.0, 2.0, 2.0, 2.0}};
        
        /* Create and init device side src buffer 1 */
        cl_mem src_1_device_buffer;
        src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float4), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float4), src_1_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_float4 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_float4));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_float4));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_float4), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer);
        ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_float4), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_float4));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Free host side src buffer 1 */
        free(src_1_host_buffer);

        /* Free device side src buffer 1 */
        ret = clReleaseMemObject(src_1_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
int main()
{
	srand(unsigned(time(nullptr)));
	int err;                            // error code returned from api calls

	cl_device_id device_id;             // compute device id 
	cl_context context;                 // compute context
	cl_command_queue commands;          // compute command queue
	cl_program program;                 // compute program
	cl_kernel kernel;                   // compute kernel

										// OpenCL device memory for matrices
	cl_mem d_A;
	cl_mem d_B;
	cl_mem d_C;

	// set seed for rand()
	srand(2014);

	//Allocate host memory for matrices A and B
	unsigned int size_A = WA * HA;
	unsigned int mem_size_A = sizeof(float) * size_A;
	float* h_A = (float*)malloc(mem_size_A);

	unsigned int size_B = WB * HB;
	unsigned int mem_size_B = sizeof(float) * size_B;
	float* h_B = (float*)malloc(mem_size_B);

	//Initialize host memory
	randomMemInit(h_A, size_A);
	randomMemInit(h_B, size_B);

	//Allocate host memory for the result C
	unsigned int size_C = WC * HC;
	unsigned int mem_size_C = sizeof(float) * size_C;
	float* h_C = (float*)malloc(mem_size_C);

	printf("Initializing OpenCL device...\n");

	cl_uint dev_cnt = 0;
	clGetPlatformIDs(0, 0, &dev_cnt);

	cl_platform_id platform_ids[100];
	clGetPlatformIDs(dev_cnt, platform_ids, NULL);

	// Connect to a compute device
	int gpu = 1;
	err = clGetDeviceIDs(platform_ids[0], gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
	if (err != CL_SUCCESS){
		printf("Error: Failed to create a device group!\n");
		return EXIT_FAILURE;
	}

	// Create a compute context 
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if (!context){
		printf("Error: Failed to create a compute context!\n");
		return EXIT_FAILURE;
	}

	// Create a command commands
	commands = clCreateCommandQueue(context, device_id, 0, &err);
	if (!commands){
		printf("Error: Failed to create a command commands!\n");
		return EXIT_FAILURE;
	}

	// Create the compute program from the source file
	char *KernelSource;
	long lFileSize = LoadOpenCLKernel("matrixmul_kernel.cl", &KernelSource);
	if (lFileSize < 0L){
		perror("File read failed");
		return 1;
	}
	//const char* KernelSource = loadKernelCPP(".\\matrixmul_kernel.cl");

	program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err);
	if (!program){
		printf("Error: Failed to create compute program!\n");
		return EXIT_FAILURE;
	}

	// Build the program executable
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS){
		size_t len;
		char buffer[2048];
		printf("Error: Failed to build program executable!\n");
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);
		exit(1);
	}

	// Create the compute kernel in the program we wish to run
	kernel = clCreateKernel(program, "matrixMul", &err);
	if (!kernel || err != CL_SUCCESS){
		printf("Error: Failed to create compute kernel!\n");
		exit(1);
	}

	// Create the input and output arrays in device memory for our calculation
	d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, NULL, &err);
	d_A = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &err);
	d_B = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &err);

	if (!d_A || !d_B || !d_C){
		printf("Error: Failed to allocate device memory!\n");
		exit(1);
	}

	printf("Running matrix multiplication for matrices A (%dx%d) and B (%dx%d) ...\n", WA, HA, WB, HB);

	//Launch OpenCL kernel
	size_t localWorkSize[2], globalWorkSize[2];

	int wA = WA;
	int wC = WC;
	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_C);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_A);
	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_B);
	err |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&wA);
	err |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&wC);

	if (err != CL_SUCCESS){
		printf("Error: Failed to set kernel arguments! %d\n", err);
		exit(1);
	}

	localWorkSize[0] = 16;
	localWorkSize[1] = 16;
	globalWorkSize[0] = 1024;
	globalWorkSize[1] = 1024;

	err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);

	if (err != CL_SUCCESS){
		printf("Error: Failed to execute kernel! %d\n", err);
		exit(1);
	}

	//Retrieve result from device
	err = clEnqueueReadBuffer(commands, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL);

	if (err != CL_SUCCESS){
		printf("Error: Failed to read output array! %d\n", err);
		exit(1);
	}
	//print table A

	printf("\nMatrix A\n");
	for (int i = 0; i < size_A; i++){
		printf("%f\t", h_A[i]);
		if (((i + 1) % WA) == 0)
			printf("\n");
	}

	//print table B

	printf("\nMatrix B\n");
	for (int i = 0; i < size_B; i++){
		printf("%f\t", h_B[i]);
		if (((i + 1) % WB) == 0)
			printf("\n");
	}

	//print out the results

	printf("\nMatrix C (Results)\n");
	for (int i = 0; i < size_C; i++){
		printf("%f\t", h_C[i]);
		if (((i + 1) % WC) == 0)
			printf("\n");
	}
	printf("\n");


	printf("Matrix multiplication completed...\n");

	//Shutdown and cleanup
	free(h_A);
	free(h_B);
	free(h_C);

	clReleaseMemObject(d_A);
	clReleaseMemObject(d_C);
	clReleaseMemObject(d_B);

	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	std::cin.clear();
	std::cin.sync();
	std::cin.get();
}