Example #1
0
int main(int argc, char** argv)
{
  cl_platform_id pf[MAX_PLATFORMS];
  cl_uint nb_platforms = 0;
  cl_int err;                            // error code returned from api calls
  cl_device_type device_type = CL_DEVICE_TYPE_ALL;

  // Filter args
  //
  argv++;
  while (argc > 1) {
    if(!strcmp(*argv, "-g") || !strcmp(*argv, "--gpu-only")) {
      if(device_type != CL_DEVICE_TYPE_ALL)
	error("--gpu-only and --cpu-only can not be specified at the same time\n");
      device_type = CL_DEVICE_TYPE_GPU;
    } else if(!strcmp(*argv, "-c") || !strcmp(*argv, "--cpu-only")) {
      if(device_type != CL_DEVICE_TYPE_ALL)
	error("--gpu-only and --cpu-only can not be specified at the same time\n");
      device_type = CL_DEVICE_TYPE_CPU;
    } else if(!strcmp(*argv, "-s") || !strcmp(*argv, "--size")) {
      unsigned i;
      int r;
      char c;

      r = sscanf(argv[1], "%u%[mMkK]", &SIZE, &c);

      if (r == 2) {
	if (c == 'k' || c == 'K')
	  SIZE *= 1024;
	else if (c == 'm' || c == 'M')
	  SIZE *= 1024 * 1024;
      }

      argc--; argv++;
    } else
      break;
    argc--; argv++;
  }

  if(argc > 1)
    TILE = atoi(*argv);

  // Get list of OpenCL platforms detected
  //
  err = clGetPlatformIDs(3, pf, &nb_platforms);
  check(err, "Failed to get platform IDs");

  printf("%d OpenCL platforms detected\n", nb_platforms);

  // For each platform do
  //
  for (cl_int p = 0; p < nb_platforms; p++) {
    cl_uint num;
    int platform_valid = 1;
    char name[1024], vendor[1024];
    cl_device_id devices[MAX_DEVICES];
    cl_uint nb_devices = 0;
    cl_context context;                 // compute context
    cl_program program;                 // compute program
    cl_kernel kernel;

    err = clGetPlatformInfo(pf[p], CL_PLATFORM_NAME, 1024, name, NULL);
    check(err, "Failed to get Platform Info");

    err = clGetPlatformInfo(pf[p], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
    check(err, "Failed to get Platform Info");

    printf("Platform %d: %s - %s\n", p, name, vendor);

    // Get list of devices
    //
    err = clGetDeviceIDs(pf[p], device_type, MAX_DEVICES, devices, &nb_devices);
    printf("nb devices = %d\n", nb_devices);

    if(nb_devices == 0)
      continue;

    // Create compute context with "device_type" devices
    //
    context = clCreateContext (0, nb_devices, devices, NULL, NULL, &err);
    check(err, "Failed to create compute context");

    // Load program source into memory
    //
    const char	*opencl_prog;
    opencl_prog = file_load(KERNEL_FILE);

    // Attach program source to context
    //
    program = clCreateProgramWithSource(context, 1, &opencl_prog, NULL, &err);
    check(err, "Failed to create program");

    // Compile program
    //
    {
      char flags[1024];

      sprintf (flags,
	       "-cl-mad-enable -cl-fast-relaxed-math -DSIZE=%d -DTILE=%d -DTYPE=%s",
	       SIZE, TILE, "float");

      err = clBuildProgram (program, 0, NULL, flags, NULL, NULL);
      if(err != CL_SUCCESS) {
	size_t len;

	// Display compiler log
	//
	clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
	{
	  char buffer[len+1];

	  fprintf(stderr, "--- Compiler log ---\n");
	  clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
	  fprintf(stderr, "%s\n", buffer);
	  fprintf(stderr, "--------------------\n");
	}
	if(err != CL_SUCCESS)
	  error("Failed to build program!\n");
      }
    }

    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, KERNEL_NAME, &err);
    check(err, "Failed to create compute kernel");

    // Allocate and initialize input data
    //
    alloc_buffers_and_user_data(context);

    // Iterate over devices
    //
    for(cl_int dev = 0; dev < nb_devices; dev++) {
      cl_command_queue queue;

      char name[1024];
      cl_device_type dtype;

      err = clGetDeviceInfo(devices[dev], CL_DEVICE_NAME, 1024, name, NULL);
      check(err, "Cannot get type of device");
      err = clGetDeviceInfo(devices[dev], CL_DEVICE_TYPE, sizeof(cl_device_type), &dtype, NULL);
      check(err, "Cannot get type of device");

      printf("\tDevice %d : %s [%s]\n", dev, (dtype == CL_DEVICE_TYPE_GPU) ? "GPU" : "CPU", name);

      // Create a command queue
      //
      queue = clCreateCommandQueue(context, devices[dev], CL_QUEUE_PROFILING_ENABLE, &err);
      check(err,"Failed to create command queue");

      // Write our data set into device buffer
      //
      send_input(queue);

      // Execute kernel
      //
      {
	cl_event prof_event;
	cl_ulong start, end;
	struct timeval t1,t2;
	double timeInMicroseconds;
	size_t global[2] = { SIZE, SIZE };  // global domain size for our calculation
	size_t local[2]  = { TILE, TILE };   // local domain size for our calculation

	printf("\t%dx%d Threads in workgroups of %dx%d\n", global[0], global[1], local[0], local[1]);

	// Set kernel arguments
	//
	err = 0;
	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_buffer);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_buffer);
	check(err, "Failed to set kernel arguments");

	gettimeofday (&t1, NULL);

	for (unsigned iter = 0; iter < ITERATIONS; iter++) {
	  err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local,
				       0, NULL, &prof_event);
	  check(err, "Failed to execute kernel");
	}

	// Wait for the command commands to get serviced before reading back results
	//
	clFinish(queue);

	gettimeofday (&t2,NULL);

	// Check performance
	//
	timeInMicroseconds = (double)TIME_DIFF(t1, t2) / ITERATIONS;

	printf("\tComputation performed in %lf µs over device #%d\n",
	       timeInMicroseconds,
	       dev);

	clReleaseEvent(prof_event);
      }

      // Read back the results from the device to verify the output
      //
      retrieve_output(queue);

      // Validate computation
      //
      check_output_data();

      clReleaseCommandQueue(queue);
    }

    // Cleanup
    //
    free_buffers_and_user_data();

    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseContext(context);
  }


  return 0;
}
Example #2
0
// Example use of the double-precision Xtrsm routine DTRSM, solving A*X = alpha*B, storing the
// result in the memory of matrix B. Uses row-major storage (C-style).
int main() {

  // OpenCL platform/device settings
  const auto platform_id = 0;
  const auto device_id = 0;

  // Example TRSM arguments
  const size_t m = 4;
  const size_t n = 3;
  const double alpha = 1.0;
  const auto a_ld = m;
  const auto b_ld = n;

  // Initializes the OpenCL platform
  auto platforms = std::vector<cl::Platform>();
  cl::Platform::get(&platforms);
  if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; }
  auto platform = platforms[platform_id];

  // Initializes the OpenCL device
  auto devices = std::vector<cl::Device>();
  platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
  if (devices.size() == 0 || device_id >= devices.size()) { return 1; }
  auto device = devices[device_id];

  // Creates the OpenCL context, queue, and an event
  auto device_as_vector = std::vector<cl::Device>{device};
  auto context = cl::Context(device_as_vector);
  auto queue = cl::CommandQueue(context, device);
  auto event = cl_event{nullptr};

  // Populate host matrices with some example data
  auto host_a = std::vector<double>({1.0,  2.0,  1.0, -2.0,
                                    0.0, -1.0, -2.0,  0.0,
                                    0.0,  0.0,  1.0,  1.0,
                                    0.0,  0.0,  0.0, -1.0});
  auto host_b = std::vector<double>({-1.0, -1.0,  3.0,
                                     1.0, -3.0,  2.0,
                                     1.0,  1.0, -1.0,
                                     4.0, -1.0, -2.0});
  // Expected result:
  //   8 -5  2
  // -11  3  4
  //   5  0 -3
  //  -4  1  2

  // Copy the matrices to the device
  auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(double));
  auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(double));
  queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(double), host_a.data());
  queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data());

  // Call the DTRSM routine. Note that the type of alpha and beta (double) determine the precision.
  auto queue_plain = queue();
  auto status = clblast::Trsm(clblast::Layout::kRowMajor, clblast::Side::kLeft,
                              clblast::Triangle::kUpper, clblast::Transpose::kNo,
                              clblast::Diagonal::kNonUnit,
                              m, n,
                              alpha,
                              device_a(), 0, a_ld,
                              device_b(), 0, b_ld,
                              &queue_plain, &event);

  // Retrieves the results
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  queue.enqueueReadBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data());

  // Example completed. See "clblast.h" for status codes (0 -> success).
  printf("Completed TRSM with status %d and results:\n", static_cast<int>(status));
  for (auto i = size_t{0}; i < m; ++i) {
    for (auto j = size_t{0}; j < n; ++j) {
      printf("%3.0f ", host_b[i * b_ld + j]);
    }
    printf("\n");
  }
  return 0;
}
/**
 * \related cl_Mem_Object_t
 *
 * This fucntion unmaps previously mapped memory for OpenCL buffer.
 *
 * @param[in,out] self  pointer to structure, in which 'Unmap' function pointer
 * is defined to point on this function.
 * @param[out] p_mapped_ptr pointer to pointer, that was returned as the
 * result of mapping operation.
 * @param[in] time_mode enumeration, that denotes how time measurement should be
 * performed.
 * @param[out] evt_to_generate pointer to OpenCL event that will be generated
 * at the end of operation.
 *
 * @return CL_SUCCESS in case of success, error code of type 'ret_code' otherwise.
 *
 * @see cl_err_codes.h for detailed error description.
 * @see 'cl_Error_t' structure for error handling.
 */
static ret_code Mem_Object_Unmap(
    scow_Mem_Object         *self, 
    cl_bool                 blocking_map,
    void                    **p_mapped_ptr, 
    TIME_STUDY_MODE         time_mode,
    cl_event                *evt_to_generate, 
    cl_command_queue        explicit_queue)
{
    cl_int ret;
    cl_event *p_unmapping_ready;

    OCL_CHECK_EXISTENCE(self, INVALID_BUFFER_GIVEN);
    OCL_CHECK_EXISTENCE(self->mapped_to_region, MEM_OBJ_NOT_MAPPED);

    if (p_mapped_ptr)
    {
        OCL_CHECK_EXISTENCE(*p_mapped_ptr, INVALID_BUFFER_GIVEN);

        /* Check if we are trying to unmap pointer, that was mapped from different
         * Memory Object. */
        if (self->mapped_to_region != *p_mapped_ptr)
        {
            OCL_DIE_ON_ERROR(WRONG_PARENT_OBJECT, CL_SUCCESS, NULL,
                    WRONG_PARENT_OBJECT);
        }
    }

    /* We generate event in any case - because later we may want to wait for
     * unmapping completion. */
    (evt_to_generate != NULL) ?
            (p_unmapping_ready = evt_to_generate) :
            (p_unmapping_ready = &self->unmap_evt);

    cl_command_queue q =
            (explicit_queue == NULL) ?
                    (self->parent_thread->q_data_htod) : (explicit_queue);

    ret = clEnqueueUnmapMemObject(q, self->cl_mem_object,
            self->mapped_to_region, 0, NULL, p_unmapping_ready);

    OCL_DIE_ON_ERROR(ret, CL_SUCCESS, NULL, ret);

    self->mapped_to_region = NULL;
    self->row_pitch = 0;

    if (p_mapped_ptr != NULL)
    {
        *p_mapped_ptr = NULL;
    }

    switch (time_mode)
    {
    case MEASURE:
        self->timer->current_time_device = Gather_Time_uS(p_unmapping_ready);
        self->timer->total_time_device += self->timer->current_time_device;
        break;

    case DONT_MEASURE:
        break;

    default:
        if (blocking_map)
        {
            ret = clWaitForEvents(1, p_unmapping_ready);
            OCL_DIE_ON_ERROR(ret, CL_SUCCESS, NULL, ret);
        }
        break;
    }

    if (p_unmapping_ready != evt_to_generate){
        clReleaseEvent(*p_unmapping_ready);
    }

    return ret;
}
Example #4
0
void run1(int N, char *fileName)
{
	puts("Matrix Vector Multiplication Naive\n");

	int i,j;

	float *A;
	A = (float*)malloc(sizeof(float)*N*N);

	for( i = 0; i < N ; ++i )
	{
		for( j = 0; j < N ; ++j )
		{
			A[i*N + j] = 1.f;	
		}
	}

	float *B;
	B = (float*)malloc(sizeof(float)*N);
	for( i = 0; i < N ; ++i )
	{
		B[i] = 1.f;	
	}
	
	float *C;
	C = (float*)malloc(sizeof(float)*N);


#ifdef DEBUG
	puts("A");
	check_2d_f(A,N,N);

	puts("B");
	check_1d_f(B,N);
#endif

	int NumK = 1;
	int NumE = 1;

	double gpuTime;
	cl_ulong gstart, gend;

	//------------------------------------------------
	//  OpenCL 
	//------------------------------------------------
	cl_int err;

	cl_platform_id platform;          // OpenCL platform
	cl_device_id device_id;           // device ID
	cl_context context;               // context
	cl_command_queue queue;           // command queue
	cl_program program;               // program

	cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK);

	cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE);    

	// read kernel file
	//char *fileName = "transpose_kernel.cl";
	char *kernelSource;
	size_t size;
	FILE *fh = fopen(fileName, "rb");
	if(!fh) {
		printf("Error: Failed to open kernel file!\n");
		exit(1);
	}
	fseek(fh,0,SEEK_END);
	size=ftell(fh);
	fseek(fh,0,SEEK_SET);
	kernelSource = malloc(size+1);
	size_t result;
	result = fread(kernelSource,1,size,fh);
	if(result != size){ fputs("Reading error", stderr);exit(1);}
	kernelSource[size] = '\0';
	
	// Bind to platform
	err = clGetPlatformIDs(1, &platform, NULL);
	OCL_CHECK(err);

	// Get ID for the device
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	OCL_CHECK(err);

	// Create a context  
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	OCL_CHECK(err);

	// Create a command queue 
	queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	OCL_CHECK(err);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err);
	OCL_CHECK(err);

	// turn on optimization for kernel
	char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only";

	err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
	if(err != CL_SUCCESS)
		printCompilerOutput(program, device_id);
	OCL_CHECK(err);

#ifdef SAVEBIN
	// Calculate size of binaries 
	size_t binary_size;
	err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL);
	OCL_CHECK(err);

	unsigned char* bin;
	bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size);

	err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &bin, NULL);
	OCL_CHECK(err);

	// Print the binary out to the output file
	fh = fopen("kernel_mv_1.bin", "wb");
	fwrite(bin, 1, binary_size, fh);
	fclose(fh);

#endif

	kernel[0] = clCreateKernel(program, "mv_1", &err);
	OCL_CHECK(err);


	// memory on device
	cl_mem A_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);
	cl_mem B_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N,  NULL, NULL);
	cl_mem C_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N,  NULL, NULL);

	// Initialize device memory
	err = clEnqueueWriteBuffer(queue, A_d, 	CL_TRUE, 0, sizeof(float)*N*N, 	A, 0, NULL , NULL); 
	OCL_CHECK(err);
	err = clEnqueueWriteBuffer(queue, B_d, 	CL_TRUE, 0, sizeof(float)*N, 	B, 0, NULL , NULL); 
	OCL_CHECK(err);

	size_t localsize =  64;
	size_t globalsize = N;


	err  = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &B_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &C_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 3, sizeof(int), &N);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	err = clEnqueueNDRangeKernel(queue, kernel[0], 1, NULL, &globalsize, &localsize, 0, NULL, &event[0]);
	OCL_CHECK(err);

	clFinish(queue);

	clEnqueueReadBuffer(queue, C_d, CL_TRUE, 0, sizeof(float)*N, C , 0, NULL , NULL );

	err = clWaitForEvents(1,&event[0]);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL);
	OCL_CHECK(err);

	gpuTime = (double)(gend -gstart)/1000000000.0;



	//check_1d_f(sum, blks+1);

#ifdef DEBUG
	puts("C = A * B");
	check_1d_f(C,N);

#endif

	printf("oclTime = %lf (s)\n", gpuTime );

	// free
	clReleaseMemObject(A_d);	
	clReleaseMemObject(B_d);	
	clReleaseMemObject(C_d);	



	clReleaseProgram(program);
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	for(i=0;i<NumK;++i){
		clReleaseKernel(kernel[i]);
	}
	for(i=0;i<NumE;++i){
		clReleaseEvent(event[i]);
	}
	free(kernelSource);


#ifdef SAVEBIN
	free(bin);
#endif

	free(A);
	free(B);
	free(C);

	return;
}
Example #5
0
cl_mem parallelRemap1( cl_mem a_buffer, cl_mem v_buffer, cl_mem b_buffer, uint asize, uint bsize, real max_a, real min_val, real min_diff, double *time ) {
    
    cl_int error = 0;
    
    uint temp_size = (uint)((max_a - min_val)/min_diff);
    
    cl_mem temp_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, temp_size*sizeof(int), NULL, &error);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    
    size_t global_work_size[1];
    size_t local_work_size[1];
    
    local_work_size[0] = TILE_SIZE;
    global_work_size[0] = ((asize+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];
    
    /******************
     * Hash Kernel
     ******************/
    
    error = clSetKernelArg(cHash_kernel, 0, sizeof(real), &min_val);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(cHash_kernel, 1, sizeof(real), &min_diff);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(cHash_kernel, 2, sizeof(cl_uint), &asize);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(cHash_kernel, 3, sizeof(cl_mem), (void*)&a_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(cHash_kernel, 4, sizeof(cl_mem), (void*)&temp_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    
    global_work_size[0] = ((asize+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];
    
    cl_event hash_kernel_event;
    
    error = clEnqueueNDRangeKernel(queue, cHash_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &hash_kernel_event);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    
    /*****************
     * Remap Kernel
     *****************/
    
    cl_mem remap_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, bsize*sizeof(real), NULL, &error);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    
    error = clSetKernelArg(remap1_kernel, 0, sizeof(real), &min_val);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 1, sizeof(real), &min_diff);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 2, sizeof(cl_uint), &temp_size);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 3, sizeof(cl_uint), &bsize);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 4, sizeof(cl_mem), (void*)&a_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 5, sizeof(cl_mem), (void*)&v_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 6, sizeof(cl_mem), (void*)&b_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 7, sizeof(cl_mem), (void*)&temp_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 8, sizeof(cl_mem), (void*)&remap_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    
    global_work_size[0] = ((bsize+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];

    cl_event remap_event;
    
    error = clEnqueueNDRangeKernel(queue, remap1_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &remap_event);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
        
    long gpu_time_start, gpu_time_end, gpu_time=0;
    
    clWaitForEvents(1, &remap_event);
    
    clGetEventProfilingInfo(hash_kernel_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL);
    clGetEventProfilingInfo(hash_kernel_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL);
    gpu_time += gpu_time_end - gpu_time_start;
    clReleaseEvent(hash_kernel_event);
        
    clGetEventProfilingInfo(remap_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL);
    clGetEventProfilingInfo(remap_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL);
    gpu_time += gpu_time_end - gpu_time_start;
    clReleaseEvent(remap_event);
    
    clReleaseMemObject(temp_buffer);

    *time = gpu_time*1.0e-9;
    
    return remap_buffer;

}
int main() {

   /* OpenCL data structures */
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_program program;
   cl_kernel kernel;
   cl_int i, err, num_vectors;

   /* Data and events */
   char data[NUM_BYTES];
   cl_mem data_buffer;
   cl_event prof_event;
   cl_ulong time_start, time_end, total_time;
   void* mapped_memory;

   /* Create a device and context */
   device = create_device();
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }     

   /* Build the program and create a kernel */
   program = build_program(context, device, PROGRAM_FILE);
   kernel = clCreateKernel(program, KERNEL_FUNC, &err);
   if(err < 0) {
      perror("Couldn't create a kernel");
      exit(1);   
   };

   /* Create a buffer to hold data */
   data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
         sizeof(data), NULL, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);   
   };         

   /* Create kernel argument */
   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer);
   if(err < 0) {
      perror("Couldn't set a kernel argument");
      exit(1);   
   };

   /* Tell kernel number of char16 vectors */
   num_vectors = NUM_BYTES/16;
   clSetKernelArg(kernel, 1, sizeof(num_vectors), &num_vectors);

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 
         CL_QUEUE_PROFILING_ENABLE, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   total_time = 0.0f;
   for(i=0; i<NUM_ITERATIONS; i++) {

      /* Enqueue kernel */
      err = clEnqueueTask(queue, kernel, 0, NULL, NULL);
      if(err < 0) {
         perror("Couldn't enqueue the kernel");
         exit(1);   
      }

#ifdef PROFILE_READ

      /* Read the buffer */
      err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, 
            sizeof(data), data, 0, NULL, &prof_event);
      if(err < 0) {
         perror("Couldn't read the buffer");
         exit(1);
      }

#else

      /* Create memory map */
      mapped_memory = clEnqueueMapBuffer(queue, data_buffer, CL_TRUE,
            CL_MAP_READ, 0, sizeof(data), 0, NULL, &prof_event, &err);
      if(err < 0) {
         perror("Couldn't map the buffer to host memory");
         exit(1);   
      }

#endif

      /* Get profiling information */
      clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
            sizeof(time_start), &time_start, NULL);
      clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
            sizeof(time_end), &time_end, NULL);
      total_time += time_end - time_start;

#ifndef PROFILE_READ

      /* Unmap the buffer */
      err = clEnqueueUnmapMemObject(queue, data_buffer, mapped_memory,
            0, NULL, NULL);
      if(err < 0) {
         perror("Couldn't unmap the buffer");
         exit(1);   
      }

#endif
   }

#ifdef PROFILE_READ
   printf("Average read time: %lu\n", total_time/NUM_ITERATIONS);
#else
   printf("Average map time: %lu\n", total_time/NUM_ITERATIONS);
#endif

   /* Deallocate resources */
   clReleaseEvent(prof_event);
   clReleaseMemObject(data_buffer);
   clReleaseKernel(kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}
Example #7
0
/*------------------------------------------------------
 ** ForwardSub() -- Forward substitution of Gaussian
 ** elimination.
 **------------------------------------------------------
 */
void ForwardSub(cl_context context, float *a, float *b, float *m, int size,int timing){    
	// 1. set up kernels
	cl_kernel fan1_kernel,fan2_kernel;
	cl_int status=0;
	cl_program gaussianElim_program;
	cl_event writeEvent,kernelEvent,readEvent;
	float writeTime=0,readTime=0,kernelTime=0;
	float writeMB=0,readMB=0;

	gaussianElim_program = cl_compileProgram(
			(char *)"gaussianElim_kernels.cl",NULL);

	fan1_kernel = clCreateKernel(
			gaussianElim_program, "Fan1", &status);
	status = cl_errChk(status, (char *)"Error Creating Fan1 kernel",true);
	if(status)exit(1);

	fan2_kernel = clCreateKernel(
			gaussianElim_program, "Fan2", &status);
	status = cl_errChk(status, (char *)"Error Creating Fan2 kernel",true);
	if(status)exit(1);

	// 2. set up memory on device and send ipts data to device

	cl_mem a_dev, b_dev, m_dev;

	cl_int error=0;

	a_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float)*size*size, NULL, &error);

	b_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float)*size, NULL, &error);

	m_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float) * size * size, NULL, &error);

	command_queue = cl_getCommandQueue();

	error = clEnqueueWriteBuffer(command_queue,
			a_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float)*size*size,
			a,
			0,
			NULL,
			&writeEvent);

	if (timing) writeTime+=eventTime(writeEvent,command_queue);
	clReleaseEvent(writeEvent);

	error = clEnqueueWriteBuffer(command_queue,
			b_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float)*size,
			b,
			0,
			NULL,
			&writeEvent);
	if (timing) writeTime+=eventTime(writeEvent,command_queue);
	clReleaseEvent(writeEvent);

	error = clEnqueueWriteBuffer(command_queue,
			m_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float)*size*size,
			m,
			0,
			NULL,
			&writeEvent);
	if (timing) writeTime+=eventTime(writeEvent,command_queue);
	clReleaseEvent(writeEvent);
	writeMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);

	// 3. Determine block sizes
	size_t globalWorksizeFan1[1];
	size_t globalWorksizeFan2[2];
	size_t localWorksizeFan1Buf[1]={BLOCK_SIZE_0};
	size_t localWorksizeFan2Buf[2]={BLOCK_SIZE_1_X, BLOCK_SIZE_1_Y};
	size_t *localWorksizeFan1=NULL;
	size_t *localWorksizeFan2=NULL;

	globalWorksizeFan1[0] = size;
	globalWorksizeFan2[0] = size;
	globalWorksizeFan2[1] = size;

	if(localWorksizeFan1Buf[0]){
		localWorksizeFan1=localWorksizeFan1Buf;
		globalWorksizeFan1[0]=(int)ceil(globalWorksizeFan1[0]/(double)localWorksizeFan1Buf[0])*localWorksizeFan1Buf[0];
	}
	if(localWorksizeFan2Buf[0]){
		localWorksizeFan2=localWorksizeFan2Buf;
		globalWorksizeFan2[0]=(int)ceil(globalWorksizeFan2[0]/(double)localWorksizeFan2Buf[0])*localWorksizeFan2Buf[0];
		globalWorksizeFan2[1]=(int)ceil(globalWorksizeFan2[1]/(double)localWorksizeFan2Buf[1])*localWorksizeFan2Buf[1];
	}

	int t;
	// 4. Setup and Run kernels
	for (t=0; t<(size-1); t++) {
		// kernel args
		cl_int argchk;
		argchk  = clSetKernelArg(fan1_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
		argchk |= clSetKernelArg(fan1_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
		argchk |= clSetKernelArg(fan1_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
		argchk |= clSetKernelArg(fan1_kernel, 3, sizeof(int), (void *)&size);
		argchk |= clSetKernelArg(fan1_kernel, 4, sizeof(int), (void *)&t);

		cl_errChk(argchk,"ERROR in Setting Fan1 kernel args",true);

		//printf("localWorksizeFan1:%u, globalWorksizeFan1:%u\n", localWorksizeFan1Buf[0], globalWorksizeFan1[0]);	
#pragma dividend local_work_group_size localWorksizeFan1 dim 1 dim1(2:64:2:64)
	//This lws will be used to profile the OpenCL kernel with id 1
			size_t _dividend_lws_localWorksizeFan1_k1[2];
		{
		_dividend_lws_localWorksizeFan1_k1[0] = getLWSValue("DIVIDEND_LWS1_D0",DIVIDEND_LWS1_D0_DEFAULT_VAL);
		//Dividend extension: store the kernel id as the last element
		_dividend_lws_localWorksizeFan1_k1[1] = 1;
		}
				// launch kernel
		error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(
				command_queue,  fan1_kernel, 1, 0,
				globalWorksizeFan1, _dividend_lws_localWorksizeFan1_k1,
				0, NULL, NULL);

		cl_errChk(error,"ERROR in Executing Fan1 Kernel",true);

		//fprintf(stderr, "AFTER THIS\n");

		argchk  = clSetKernelArg(fan2_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
		argchk |= clSetKernelArg(fan2_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
		argchk |= clSetKernelArg(fan2_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
		argchk |= clSetKernelArg(fan2_kernel, 3, sizeof(int), (void *)&size);
		argchk |= clSetKernelArg(fan2_kernel, 4, sizeof(int), (void *)&t);

		cl_errChk(argchk,"ERROR in Setting Fan2 kernel args",true);

		size_t local_work_size[] = {128, 128};

		//printf("localWorksizeFan2:%u, globalWorksizeFan2[0]:%u, globalWorksizeFan2[1]:%u\n", localWorksizeFan2Buf[0], globalWorksizeFan2[0], globalWorksizeFan2[1]);	
#pragma dividend local_work_group_size local_work_size dim 2 dim1(8:64:2:64) dim2(8:64:2:64)
	//This lws will be used to profile the OpenCL kernel with id 2
			size_t _dividend_lws_local_work_size_k2[3];
		{
		_dividend_lws_local_work_size_k2[0] = getLWSValue("DIVIDEND_LWS2_D0",DIVIDEND_LWS2_D0_DEFAULT_VAL);
		_dividend_lws_local_work_size_k2[1] = getLWSValue("DIVIDEND_LWS2_D1",DIVIDEND_LWS2_D1_DEFAULT_VAL);
		//Dividend extension: store the kernel id as the last element
		_dividend_lws_local_work_size_k2[2] = 2;
		}
				// launch kernel
		error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(
				command_queue,  fan2_kernel, 2, 0,
				globalWorksizeFan2, _dividend_lws_local_work_size_k2,
				0, NULL, NULL);

		cl_errChk(error,"ERROR in Executing Fan2 Kernel",true);
	
		if (timing) {
			//             printf("here2a\n");
			//             kernelTime+=eventTime(kernelEvent,command_queue);
			//             printf("here2b\n");
		}
		clReleaseEvent(kernelEvent);
		//Fan2<<<dimGridXY,dimBlockXY>>>(m_cuda,a_cuda,b_cuda,Size,Size-t,t);
		//cudaThreadSynchronize();
	}
	// 5. transfer data off of device
	error = clEnqueueReadBuffer(command_queue,
			a_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float) * size * size,
			a,
			0,
			NULL,
			&readEvent);

	cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
	if (timing) readTime+=eventTime(readEvent,command_queue);
	clReleaseEvent(readEvent);

	error = clEnqueueReadBuffer(command_queue,
			b_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float) * size,
			b,
			0,
			NULL,
			&readEvent);
	cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
	if (timing) readTime+=eventTime(readEvent,command_queue);
	clReleaseEvent(readEvent);

	error = clEnqueueReadBuffer(command_queue,
			m_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float) * size * size,
			m,
			0,
			NULL,
			&readEvent);

	cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
	if (timing) readTime+=eventTime(readEvent,command_queue);
	clReleaseEvent(readEvent);
	readMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);

	if (timing) {
		printf("Matrix Size\tWrite(s) [size]\t\tKernel(s)\tRead(s)  [size]\t\tTotal(s)\n");
		printf("%dx%d      \t",size,size);

		printf("%f [%.2fMB]\t",writeTime,writeMB);


		printf("%f\t",kernelTime);


		printf("%f [%.2fMB]\t",readTime,readMB);

		printf("%f\n\n",writeTime+kernelTime+readTime);
	}

}
Example #8
0
int main(int argc, char *argv[])
{
	// selected platform and device number
	cl_uint pn = 0, dn = 0;

	// OpenCL error
	cl_int error;

	// generic iterator
	cl_uint i;

	// major/minor version of the platform OpenCL version
	cl_uint ocl_major, ocl_minor;

	// set platform/device num from command line
	if (argc > 1)
		pn = atoi(argv[1]);
	if (argc > 2)
		dn = atoi(argv[2]);

	error = clGetPlatformIDs(0, NULL, &np);
	CHECK_ERROR("getting amount of platform IDs");
	printf("%u platforms found\n", np);
	if (pn >= np) {
		fprintf(stderr, "there is no platform #%u\n" , pn);
		exit(1);
	}
	// only allocate for IDs up to the intended one
	platform = calloc(pn+1,sizeof(*platform));
	// if allocation failed, next call will bomb. rely on this
	error = clGetPlatformIDs(pn+1, platform, NULL);
	CHECK_ERROR("getting platform IDs");

	// choose platform
	p = platform[pn];

	error = clGetPlatformInfo(p, CL_PLATFORM_NAME, BUFSZ, strbuf, NULL);
	CHECK_ERROR("getting platform name");
	printf("using platform %u: %s\n", pn, strbuf);

	error = clGetPlatformInfo(p, CL_PLATFORM_VERSION, BUFSZ, strbuf, NULL);
	CHECK_ERROR("getting platform version");
	// we need 1.2 at least
	i = sscanf(strbuf, "OpenCL %u.%u ", &ocl_major, &ocl_minor);
	if (i != 2) {
		fprintf(stderr, "%s:%u: unable to determine platform OpenCL version\n",
			__func__, __LINE__);
		exit(1);
	}
	if (ocl_major == 1 && ocl_minor < 2) {
		fprintf(stderr, "%s:%u: Platform version %s is not at least 1.2\n",
			__func__, __LINE__, strbuf);
		exit(1);
	}

	error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &nd);
	CHECK_ERROR("getting amount of device IDs");
	printf("%u devices found\n", nd);
	if (dn >= nd) {
		fprintf(stderr, "there is no device #%u\n", dn);
		exit(1);
	}
	// only allocate for IDs up to the intended one
	device = calloc(dn+1,sizeof(*device));
	// if allocation failed, next call will bomb. rely on this
	error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, dn+1, device, NULL);
	CHECK_ERROR("getting device IDs");

	// choose device
	d = device[dn];
	error = clGetDeviceInfo(d, CL_DEVICE_NAME, BUFSZ, strbuf, NULL);
	CHECK_ERROR("getting device name");
	printf("using device %u: %s\n", dn, strbuf);

	error = clGetDeviceInfo(d, CL_DEVICE_GLOBAL_MEM_SIZE,
			sizeof(gmem), &gmem, NULL);
	CHECK_ERROR("getting device global memory size");
	error = clGetDeviceInfo(d, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
			sizeof(alloc_max), &alloc_max, NULL);
	CHECK_ERROR("getting device max memory allocation size");

	// create context
	ctx_prop[1] = (cl_context_properties)p;
	ctx = clCreateContext(ctx_prop, 1, &d, NULL, NULL, &error);
	CHECK_ERROR("creating context");

	// create queue
	q = clCreateCommandQueue(ctx, d, CL_QUEUE_PROFILING_ENABLE, &error);
	CHECK_ERROR("creating queue");

	// create program
	pg = clCreateProgramWithSource(ctx, sizeof(src)/sizeof(*src), src, NULL, &error);
	CHECK_ERROR("creating program");

	// build program
	error = clBuildProgram(pg, 1, &d, NULL, NULL, NULL);
	CHECK_ERROR("building program");

	// get kernel
	k = clCreateKernel(pg, "add", &error);
	CHECK_ERROR("creating kernel");

	error = clGetKernelWorkGroupInfo(k, d, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
			sizeof(wgm), &wgm, NULL);
	CHECK_ERROR("getting preferred workgroup size multiple");

	// number of elements on which kernel will be launched. it's ok if we don't
	// cover every byte of the buffers
	nels = alloc_max/sizeof(cl_float);

	gws = ROUND_MUL(nels, wgm);

	printf("will use %zu workitems grouped by %zu to process %u elements\n",
			gws, wgm, nels);

	// we will try and allocate at least one buffer more than needed to fill
	// the device memory, and no less than 3 anyway
	nbuf = gmem/alloc_max + 1;
	if (nbuf < 3)
		nbuf = 3;

#define MB (1024*1024.0)

	printf("will try allocating %u host buffers of %gMB each to overcommit %gMB\n",
			nbuf, alloc_max/MB, gmem/MB);

	hostbuf = calloc(nbuf, sizeof(cl_mem));

	if (!hostbuf) {
		fprintf(stderr, "could not prepare support for %u buffers\n", nbuf);
		exit(1);
	}

	// allocate ‘host’ buffers
	for (i = 0; i < nbuf; ++i) {
		hostbuf[i] = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, alloc_max,
				NULL, &error);
		CHECK_ERROR("allocating host buffer");
		printf("host buffer %u allocated\n", i);
		error = clEnqueueMigrateMemObjects(q, 1, hostbuf + i,
				CL_MIGRATE_MEM_OBJECT_HOST | CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED,
				0, NULL, NULL);
		CHECK_ERROR("migrating buffer to host");
		printf("buffer %u migrated to host\n", i);
	}

	// allocate ‘device’ buffers
	for (i = 0; i < 2; ++i) {
		devbuf[i] = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, alloc_max,
				NULL, &error);
		CHECK_ERROR("allocating devbuffer");
		printf("dev buffer %u allocated\n", i);
		if (i == 0) {
			float patt = 0;
			error = clEnqueueFillBuffer(q, devbuf[0], &patt, sizeof(patt),
					0, nels*sizeof(patt), 0, NULL, &mem_evt);
			CHECK_ERROR("enqueueing memset");
		}
	}
	error = clWaitForEvents(1, &mem_evt);
	CHECK_ERROR("waiting for buffer fill");
	clReleaseEvent(mem_evt); mem_evt = NULL;

	// use the buffers
	for (i = 0; i < nbuf; ++i) {
		printf("testing buffer %u\n", i);

		// for each buffer, we do a setup on CPU and then use it as second
		// argument for the kernel
		hbuf = clEnqueueMapBuffer(q, hostbuf[i], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
				0, alloc_max, 0, NULL, NULL, &error);
		CHECK_ERROR("mapping buffer");
		for (e = 0; e < nels; ++e)
			hbuf[e] = i;
		error = clEnqueueUnmapMemObject(q, hostbuf[i], hbuf, 0, NULL, NULL);
		CHECK_ERROR("unmapping buffer");
		hbuf = NULL;

		// copy ‘host’ to ‘device’ buffer
		clEnqueueCopyBuffer(q, hostbuf[i], devbuf[1], 0, 0, alloc_max,
				0, NULL, NULL);
		// make sure all pending actions are completed
		error =	clFinish(q);
		CHECK_ERROR("settling down");

		clSetKernelArg(k, 0, sizeof(cl_mem), devbuf);
		clSetKernelArg(k, 1, sizeof(cl_mem), devbuf + 1);
		clSetKernelArg(k, 2, sizeof(nels), &nels);
		error = clEnqueueNDRangeKernel(q, k, 1, NULL, &gws, &wgm,
				0, NULL, &krn_evt);
		CHECK_ERROR("enqueueing kernel");

		error = clEnqueueCopyBuffer(q, devbuf[0], hostbuf[0],
				0, 0, alloc_max, 1, &krn_evt, &mem_evt);
		CHECK_ERROR("copying data to host");

		expected = i*(i+1)/2.0f;
		hbuf = clEnqueueMapBuffer(q, hostbuf[0], CL_TRUE, CL_MAP_READ,
				0, alloc_max, 1, &mem_evt, NULL, &error);
		CHECK_ERROR("mapping buffer 0");
		for (e = 0; e < nels; ++e)
			if (hbuf[e] != expected) {
				fprintf(stderr, "mismatch @ %u: %g instead of %g\n",
						e, hbuf[e], expected);
				exit(1);
			}
		error = clEnqueueUnmapMemObject(q, hostbuf[0], hbuf, 0, NULL, NULL);
		CHECK_ERROR("unmapping buffer 0");
		hbuf = NULL;
		clReleaseEvent(krn_evt);
		clReleaseEvent(mem_evt);
		krn_evt = mem_evt = NULL;
	}

	for (i = 1; i <= 2; ++i) {
		clReleaseMemObject(devbuf[2 - i]);
		printf("dev buffer %u freed\n", nbuf  - i);
	}
	for (i = 1; i <= nbuf; ++i) {
		clReleaseMemObject(hostbuf[nbuf - i]);
		printf("host buffer %u freed\n", nbuf  - i);
	}

	return 0;
}
void cpu_to_opencl_opencl_func(void *buffers[], void *args)
{
	STARPU_SKIP_IF_VALGRIND;

	(void) args;
	int id, devid, ret;
        cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
	cl_mem src = (cl_mem) STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
	cl_mem dst = (cl_mem) STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION,
						  &opencl_conversion_program,
						  NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");

	err = starpu_opencl_load_kernel(&kernel,
					&queue,
					&opencl_conversion_program,
					"cpu_to_opencl_opencl",
					devid);

	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(src), &src);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 1, sizeof(dst), &dst);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 2, sizeof(n), &n);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);


	{
		size_t global=n;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel,
						device,
						CL_KERNEL_WORK_GROUP_SIZE,
						sizeof(local),
						&local,
						&s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);

                if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(queue,
					kernel,
					1,
					NULL,
					&global,
					&local,
					0,
					NULL,
					&event);

		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
        ret = starpu_opencl_unload_opencl(&opencl_conversion_program);
        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
}
Example #10
0
void mat_mul_opencl_1d(float *M_A, float *M_B, float *M_C,
                       size_t ROW_A, size_t COL_A, size_t COL_B) {
  cl_platform_id   *platform;
  cl_device_type   dev_type;
  cl_device_id     dev;
  cl_context       context;
  cl_command_queue cmd_queue;
  cl_program       program;
  cl_kernel        kernel;
  cl_mem           mem_A, mem_B, mem_C;
  cl_event         ev_kernel;
  cl_int           err;
  cl_uint          num_platforms;
  cl_uint          num_dev = 0;
  int i;

  // Platform
  err = clGetPlatformIDs(0, NULL, &num_platforms);
  CHECK_ERROR(err);
  if (num_platforms == 0) {
    fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__);
    exit(EXIT_FAILURE);
  }
  printf("Number of platforms: %u\n", num_platforms);
  platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  err = clGetPlatformIDs(num_platforms, platform, NULL);
  CHECK_ERROR(err);

  // Device
  dev_type = get_device_type();
  for (i = 0; i < num_platforms; i++) {
    err = clGetDeviceIDs(platform[i], dev_type, 1, &dev, &num_dev);
    if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err);
    if (num_dev == 1) break;
  }
  if (num_dev < 1) {
    fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
  }
  print_device_name(dev);
  free(platform);

  // Context
  context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
  CHECK_ERROR(err);

  // Command queue
  cmd_queue = clCreateCommandQueue(context, dev,
                                   CL_QUEUE_PROFILING_ENABLE,
                                   &err);
  CHECK_ERROR(err);

  // Create a program.
  char *source_code = get_source_code("./kernel_1d.cl");
  program = clCreateProgramWithSource(context,
                                      1, (const char **)&source_code,
                                      NULL, &err);
  free(source_code);
  CHECK_ERROR(err);

  // Build the program.
  char build_opts[200];
  sprintf(build_opts, "-DROW_A=%lu -DCOL_A=%lu -DCOL_B=%lu",
          ROW_A, COL_A, COL_B);
  err = clBuildProgram(program, 1, &dev, build_opts, NULL, NULL);
  if (err != CL_SUCCESS) {
    print_build_log(program, dev);
    CHECK_ERROR(err);
  }

  // Kernel
  kernel = clCreateKernel(program, "mat_mul", &err);
  CHECK_ERROR(err);

  // Buffers
  mem_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                         sizeof(float) * ROW_A * COL_A,
                         M_A, &err);
  CHECK_ERROR(err);

  mem_B = clCreateBuffer(context, CL_MEM_READ_ONLY, 
                         sizeof(float) * COL_A * COL_B,
                         NULL, &err);
  CHECK_ERROR(err);
  err = clEnqueueWriteBuffer(cmd_queue,
                             mem_B,
                             CL_FALSE, 0,
                             sizeof(float) * COL_A * COL_B,
                             M_B,
                             0, NULL, NULL);
  CHECK_ERROR(err)

  mem_C = clCreateBuffer(context, CL_MEM_READ_WRITE, 
                         sizeof(float) * ROW_A * COL_B,
                         NULL, &err);
  CHECK_ERROR(err);

  // Set the arguments.
  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_A);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_B);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_C);
  CHECK_ERROR(err);

  // Enqueue the kernel.
  size_t lws[1] = {256};
  size_t gws[1];
  gws[0] = (size_t)ceil((double)ROW_A / lws[0]) * lws[0];
  err = clEnqueueNDRangeKernel(cmd_queue,
                               kernel,
                               1, NULL,
                               gws, lws,
                               0, NULL,
                               &ev_kernel);
  CHECK_ERROR(err);

  // Read the result.
  err = clEnqueueReadBuffer(cmd_queue,
                            mem_C,
                            CL_TRUE, 0,
                            sizeof(float) * ROW_A * COL_B, 
                            M_C,
                            0, NULL, NULL);
  CHECK_ERROR(err);

  // Read the profiling info.
  cl_ulong start_time, end_time;
  err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_START, 
                                sizeof(cl_ulong), &start_time, NULL);
  CHECK_ERROR(err);
  err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_END, 
                                sizeof(cl_ulong), &end_time, NULL);
  CHECK_ERROR(err);
  printf("Kernel time : %lf sec\n", (double)(end_time - start_time) / 10e9);

  // Release
  clReleaseEvent(ev_kernel);
  clReleaseMemObject(mem_A);
  clReleaseMemObject(mem_B);
  clReleaseMemObject(mem_C);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);
}
void mandelbrot(int m, int n)
{
  cl_platform_id   *platform;
  cl_device_type   dev_type = CL_DEVICE_TYPE_GPU;
  cl_device_id     *devs = NULL;
  cl_context       context;
  cl_command_queue *cmd_queues;
  cl_program       program;
  cl_kernel        *kernels;
  cl_mem           *mem_R;
  cl_mem		   *mem_G;
  cl_mem		   *mem_B;
  cl_int           err;
  cl_uint          num_platforms;
  cl_uint          num_devs = 0;
  cl_event		   *ev_kernels;

	
		
  int count_max = COUNT_MAX;
  int i, j, jhi, jlo;
  char *output_filename = "mandelbrot.ppm";
  FILE *output_unit;
  double wtime;

  float x_max =   1.25;
  float x_min = - 2.25;
//  float x;
//  float x1;
//  float x2;
  float y_max =   1.75;
  float y_min = - 1.75;
  //float y;
  //float y1;
  //float y2;

  size_t size_color;

  size_color = sizeof(int) * m * n;

  int (*r)[n] = (int (*)[n])calloc(m * n, sizeof(int));
  int (*g)[n] = (int (*)[n])calloc(m * n, sizeof(int));
  int (*b)[n] = (int (*)[n])calloc(m * n, sizeof(int));

  printf( "  Sequential C version\n" );
  printf( "\n" );
  printf( "  Create an ASCII PPM image of the Mandelbrot set.\n" );
  printf( "\n" );
  printf( "  For each point C = X + i*Y\n" );
  printf( "  with X range [%g,%g]\n", x_min, x_max );
  printf( "  and  Y range [%g,%g]\n", y_min, y_max );
  printf( "  carry out %d iterations of the map\n", count_max );
  printf( "  Z(n+1) = Z(n)^2 + C.\n" );
  printf( "  If the iterates stay bounded (norm less than 2)\n" );
  printf( "  then C is taken to be a member of the set.\n" );
  printf( "\n" );
  printf( "  An ASCII PPM image of the set is created using\n" );
  printf( "    M = %d pixels in the X direction and\n", m );
  printf( "    N = %d pixels in the Y direction.\n", n );

  timer_init();
  timer_start(0);

  // Platform
  err = clGetPlatformIDs(0, NULL, &num_platforms);
  CHECK_ERROR(err);
  if (num_platforms == 0) {
    fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__);
    exit(EXIT_FAILURE);
  }

  printf("Number of platforms: %u\n", num_platforms);
  platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  err = clGetPlatformIDs(num_platforms, platform, NULL);
  CHECK_ERROR(err);
  
  // Device
  for (i = 0; i < num_platforms; i++) {
    err = clGetDeviceIDs(platform[i], dev_type, 0, NULL, &num_devs);
    if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err);
	num_devs = 1; //**
    if (num_devs >= 1)
	{
		devs = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devs);

		err = clGetDeviceIDs(platform[i], dev_type, num_devs, devs, NULL);
		break;
	}
  }
  if ( devs == NULL || num_devs < 1) {
    fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
  }

  for( i = 0; i < num_devs; ++i ) {
	printf("dev[%d] : ", i);
  	print_device_name(devs[i]);
  }

  // Context
  context = clCreateContext(NULL, num_devs, devs, NULL, NULL, &err);
  CHECK_ERROR(err);

  // Command queue
  cmd_queues = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_devs);
  for( i = 0; i < num_devs; ++i) {
	cmd_queues[i] = clCreateCommandQueue(context, devs[i], 0, &err);
  	CHECK_ERROR(err);
  }

  // Create a program.
  size_t source_len;
  char *source_code = get_source_code("./mandelbrot_kernel.cl", &source_len);
  program = clCreateProgramWithSource(context,
                                      1,
                                      (const char **)&source_code,
                                      &source_len,
                                      &err);
  free(source_code);
  CHECK_ERROR(err);

  // Build the program.
  char build_opts[200];
  sprintf(build_opts, "-Dm=%d -Dn=%d -Dnum_devs=%d", m, n, num_devs);
  err = clBuildProgram(program, num_devs, devs, build_opts, NULL, NULL);
  if (err != CL_SUCCESS) {
    print_build_log(program, devs[0]);
    CHECK_ERROR(err);
  }
  
  // Kernel
  kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*num_devs);
  for (i = 0; i < num_devs; i++) {
	  kernels[i] = clCreateKernel(program, "mandelbrot_kernel", NULL);
  }
 
  // Buffers  
  mem_R = (cl_mem*)malloc(sizeof(cl_mem)*num_devs);
  mem_G = (cl_mem*)malloc(sizeof(cl_mem)*num_devs);
  mem_B = (cl_mem*)malloc(sizeof(cl_mem)*num_devs);

  for(i = 0; i < num_devs; i++) {
	  mem_R[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                         size_color / num_devs, NULL, NULL);
	  mem_G[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                         size_color / num_devs, NULL, NULL);
	  mem_B[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                         size_color / num_devs, NULL, NULL);
  }

/*
  // Write to Buffers
  for(i = 0; i < num_devs; i++) {
  	clEnqueueWriteBuffer(cmd_queues[i],
                         mem_CHECK[i], 
                         CL_FALSE, 0,
                         size_CHECK / num_devs,
                         (CHECK + (N / num_devs) * i),
                         0, NULL, NULL);
  }
*/

  // Set the arguments.
  for(i = 0; i < num_devs; i++) {
//	  flag = i * (m * n / num_devs);
  	clSetKernelArg(kernels[i], 0, sizeof(cl_mem), (void*) &mem_R[i]);
	clSetKernelArg(kernels[i], 1, sizeof(cl_mem), (void*) &mem_G[i]);
  	clSetKernelArg(kernels[i], 2, sizeof(cl_mem), (void*) &mem_B[i]);

	clSetKernelArg(kernels[i], 3, sizeof(int), &count_max);
	clSetKernelArg(kernels[i], 4, sizeof(float), &x_max);
	clSetKernelArg(kernels[i], 5, sizeof(float), &x_min);
	clSetKernelArg(kernels[i], 6, sizeof(float), &y_max);
	clSetKernelArg(kernels[i], 7, sizeof(float), &y_min);
  }

  // Enqueue the kernel.
  size_t lws[1] = {256};
  size_t gws[1] = { m * n /num_devs };
  gws[0] = (size_t)ceil((double)m * n / lws[0]) * lws[0];
  ev_kernels = (cl_event*)malloc(sizeof(cl_event)*num_devs);
  for(i = 0; i < num_devs; i++) {
 	 err = clEnqueueNDRangeKernel(cmd_queues[i], kernels[i], 1, NULL, gws, lws, 0, NULL, &ev_kernels[i]);
  	 CHECK_ERROR(err);
  }

  // Read the result.
  for(i = 0; i < num_devs; i++) {
  	err = clEnqueueReadBuffer(cmd_queues[i],
                            mem_R[i],
                            CL_TRUE, 0,
                            size_color / num_devs,
                            r,
                            1, &ev_kernels[i], NULL);
  	err = clEnqueueReadBuffer(cmd_queues[i],
                            mem_G[i],
                            CL_TRUE, 0,
                            size_color / num_devs,
                            g,
							1, &ev_kernels[i], NULL);
   	err = clEnqueueReadBuffer(cmd_queues[i],
                            mem_B[i],
                            CL_TRUE, 0,
							size_color / num_devs,
							b,
                            1, &ev_kernels[i], NULL);
  }

  // Release
  for( i = 0; i < num_devs; ++i ) {
  clFinish(cmd_queues[i]); 
  clReleaseMemObject(mem_R[i]);
  clReleaseMemObject(mem_G[i]);
  clReleaseMemObject(mem_B[i]);
  clReleaseKernel(kernels[i]);
  clReleaseCommandQueue(cmd_queues[i]);
  clReleaseEvent(ev_kernels[i]);
  }
  clReleaseProgram(program);
  clReleaseContext(context);
  free(mem_R);
  free(mem_G);
  free(mem_B);
  free(cmd_queues);
  free(kernels);
  free(devs);
  free(ev_kernels);
  free(platform);

  timer_stop(0);
  wtime = timer_read(0);
  printf( "\n" );
  printf( "  Time = %lf seconds.\n", wtime );

  // Write data to an ASCII PPM file.
  output_unit = fopen( output_filename, "wt" );

  fprintf( output_unit, "P3\n" );
  fprintf( output_unit, "%d  %d\n", n, m );
  fprintf( output_unit, "%d\n", 255 );
  for ( i = 0; i < m; i++ )
  {
    for ( jlo = 0; jlo < n; jlo = jlo + 4 )
    {
      jhi = MIN( jlo + 4, n );
      for ( j = jlo; j < jhi; j++ )
      {
        fprintf( output_unit, "  %d  %d  %d", r[i][j], g[i][j], b[i][j] );
      }
      fprintf( output_unit, "\n" );
    }
  }

  fclose( output_unit );
  printf( "\n" );
  printf( "  Graphics data written to \"%s\".\n\n", output_filename );

  // Terminate.
  free(r);
  free(g);
  free(b);
}
Example #12
0
int
main (int argc, const char **argv)
{
    OclPlatform *ocl;
    cl_program program;
    cl_device_id *devices;
    cl_command_queue *queues;
    cl_kernel kernel;
    cl_int errcode;
    int num_devices;
    GTimer *timer;

    ocl = ocl_new_from_args (argc, argv, CL_QUEUE_PROFILING_ENABLE);

    program = ocl_create_program_from_source (ocl, source, NULL, &errcode);
    OCL_CHECK_ERROR (errcode);

    kernel = clCreateKernel (program, "touch", &errcode);
    OCL_CHECK_ERROR (errcode);

    num_devices = ocl_get_num_devices (ocl);
    devices = ocl_get_devices (ocl);
    queues = ocl_get_cmd_queues (ocl);
    timer = g_timer_new ();

    for (int i = 0; i < num_devices; i++) {
        char name[256];
        cl_event event;
        size_t size = 16;
        const int NUM_RUNS = 50000;
        unsigned long total_wait = 0;
        unsigned long total_execution = 0;
        double wall_clock = 0.0;

        for (int r = 0; r < NUM_RUNS; r++) {
            unsigned long wait;
            unsigned long execution;

            g_timer_start (timer);
            OCL_CHECK_ERROR (clEnqueueNDRangeKernel (queues[i], kernel, 
                        1, NULL, &size, NULL,
                        0, NULL, &event));

            clWaitForEvents (1, &event);
            g_timer_stop (timer);

            wall_clock += g_timer_elapsed (timer, NULL);

            get_event_times (event, &wait, &execution);
            clReleaseEvent (event);

            total_wait += wait;
            total_execution += execution;
        }

        OCL_CHECK_ERROR (clGetDeviceInfo (devices[i], CL_DEVICE_NAME, 256, name, NULL));

        /* all times in nano seconds */
        printf ("%s %f %f %f\n", name,
                total_wait / ((double) NUM_RUNS),
                total_execution / ((double) NUM_RUNS),
                wall_clock / NUM_RUNS * 1000 * 1000 * 1000);
    }

    g_timer_destroy (timer);
    clReleaseKernel (kernel);
    clReleaseProgram (program);

    ocl_free (ocl);
}
Example #13
0
int task(cl_context context, cl_device_id device, cl_command_queue queue, void* data_)
{
  const TaskData* data = (const TaskData*) data_;
  cl_int err;

  if (data->points % data->points_per_work_item)
    check_error(CLQMC_INVALID_VALUE, "points must be a multiple of points_per_work_item");

  if (data->replications % data->replications_per_work_item)
    check_error(CLQMC_INVALID_VALUE, "replications must be a multiple of replications_per_work_item");


  // Lattice buffer

  size_t pointset_size;
  // gen_vec is given in common.c
  clqmcLatticeRule* pointset = clqmcLatticeRuleCreate(data->points, DIMENSION, gen_vec, &pointset_size, &err);
  check_error(err, NULL);

  cl_mem pointset_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
      pointset_size, pointset, &err);
  check_error(err, "cannot create point set buffer");


  // Shifts buffer
  
  clqmc_fptype* shifts = (clqmc_fptype*) malloc(data->replications * DIMENSION * sizeof(clqmc_fptype));

  // populate random shifts using a random stream
  clrngMrg31k3pStream* stream = clrngMrg31k3pCreateStreams(NULL, 1, NULL, &err);
  check_error(err, NULL);
  for (cl_uint i = 0; i < data->replications; i++)
      for (cl_uint j = 0; j < DIMENSION; j++)
          shifts[i * DIMENSION + j] = clrngMrg31k3pRandomU01(stream);
  err = clrngMrg31k3pDestroyStreams(stream);
  check_error(err, NULL);

  cl_mem shifts_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
      data->replications * DIMENSION * sizeof(clqmc_fptype), shifts, &err);
  check_error(err, "cannot create shifts buffer");


  // Output buffer

  size_t points_block_count = data->points / data->points_per_work_item;
  cl_mem output_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, 
      data->replications * points_block_count * sizeof(clqmc_fptype), NULL, &err);
  check_error(err, "cannot create output buffer");


  // OpenCL kernel

  cl_program program = build_program_from_file(context, device,
      "client/DocsTutorial/example4_kernel.cl",
      NULL);
  check_error(err, NULL);
  cl_kernel kernel = clCreateKernel(program, "simulateWithRQMC", &err);
  check_error(err, "cannot create kernel");

  int iarg = 0;
  err  = clSetKernelArg(kernel, iarg++, sizeof(pointset_buf), &pointset_buf);
  err |= clSetKernelArg(kernel, iarg++, sizeof(shifts_buf), &shifts_buf);
  err |= clSetKernelArg(kernel, iarg++, sizeof(data->points_per_work_item), &data->points_per_work_item);
  err |= clSetKernelArg(kernel, iarg++, sizeof(data->replications), &data->replications);
  err |= clSetKernelArg(kernel, iarg++, sizeof(output_buf), &output_buf);
  check_error(err, "cannot set kernel arguments");


  // Execution

  cl_event ev;
  size_t global_size = (data->replications / data->replications_per_work_item) * points_block_count;
  err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, &ev);
  check_error(err, "cannot enqueue kernel");

  err = clWaitForEvents(1, &ev);
  check_error(err, "error waiting for events");

  clqmc_fptype* output = (clqmc_fptype*) malloc(data->replications * points_block_count * sizeof(clqmc_fptype));
  err = clEnqueueReadBuffer(queue, output_buf, CL_TRUE, 0,
      data->replications * points_block_count * sizeof(clqmc_fptype), output, 0, NULL, NULL);
  check_error(err, "cannot read output buffer");

  printf("\nAdvanced randomized quasi-Monte Carlo integration:\n\n");

  err = clqmcLatticeRuleWriteInfo(pointset, stdout);
  check_error(err, NULL);
  printf("\n");

  rqmcReport(data->replications, data->points, points_block_count, output);


  // Clean up

  clReleaseEvent(ev);
  clReleaseMemObject(output_buf);
  clReleaseMemObject(pointset_buf);
  clReleaseKernel(kernel);
  clReleaseProgram(program);

  free(output);
  err = clqmcLatticeRuleDestroy(pointset);
  check_error(err, NULL);

  return EXIT_SUCCESS;
}
Example #14
0
int main(int argc, char **argv)
{
	cl_platform_id platforms[100];
	cl_uint platforms_n = 0;
	CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n));

	printf("=== %d OpenCL platform(s) found: ===\n", platforms_n);
	for (int i=0; i<platforms_n; i++)
	{
		char buffer[10240];
		printf("  -- %d --\n", i);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL));
		printf("  PROFILE = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL));
		printf("  VERSION = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL));
		printf("  NAME = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL));
		printf("  VENDOR = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL));
		printf("  EXTENSIONS = %s\n", buffer);
	}

	cl_device_id devices[100];
	cl_uint devices_n = 0;
	// CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n));
	CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 100, devices, &devices_n));

	printf("=== %d OpenCL device(s) found on platform:\n", platforms_n);
	for (int i=0; i<devices_n; i++)
	{
		char buffer[10240];
		cl_uint buf_uint;
		cl_ulong buf_ulong;
		printf("  -- %d --\n", i);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_NAME = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VENDOR = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DRIVER_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL));
		printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);
	}

	if (devices_n == 0)
		return 1;

	cl_context context;
	context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices, &pfn_notify, NULL, &_err));

	const char *program_source[] = {
		"__kernel void simple_demo(__global int *src, __global int *dst, int factor)\n",
		"{\n",
		"	int i = get_global_id(0);\n",
		"	dst[i] = src[i] * factor;\n",
		"}\n"
	};

	cl_program program;
	program = CL_CHECK_ERR(clCreateProgramWithSource(context, sizeof(program_source)/sizeof(*program_source), program_source, NULL, &_err));
	if (clBuildProgram(program, 1, devices, "", NULL, NULL) != CL_SUCCESS) {
		char buffer[10240];
		clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
		fprintf(stderr, "CL Compilation failed:\n%s", buffer);
		abort();
	}
	CL_CHECK(clUnloadCompiler());

	cl_mem input_buffer;
	input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*NUM_DATA, NULL, &_err));

	cl_mem output_buffer;
	output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int)*NUM_DATA, NULL, &_err));

	int factor = 2;

	cl_kernel kernel;
	kernel = CL_CHECK_ERR(clCreateKernel(program, "simple_demo", &_err));
	CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer));
	CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
	CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor));

	cl_command_queue queue;
	queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[0], 0, &_err));

	for (int i=0; i<NUM_DATA; i++) {
		CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &i, 0, NULL, NULL));
	}

	cl_event kernel_completion;
	size_t global_work_size[1] = { NUM_DATA };
	CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion));
	CL_CHECK(clWaitForEvents(1, &kernel_completion));
	CL_CHECK(clReleaseEvent(kernel_completion));

	printf("Result:");
	for (int i=0; i<NUM_DATA; i++) {
		int data;
		CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &data, 0, NULL, NULL));
		printf(" %d", data);
	}
	printf("\n");

	CL_CHECK(clReleaseMemObject(input_buffer));
	CL_CHECK(clReleaseMemObject(output_buffer));

	CL_CHECK(clReleaseKernel(kernel));
	CL_CHECK(clReleaseProgram(program));
	CL_CHECK(clReleaseContext(context));

	return 0;
}
Example #15
0
static void
fill_cc_ocl_gpu_wrapper_helmholtzbem3d(void *data, uint kernel)
{
  merge_data_nf *mdata = (merge_data_nf *) data;

  cl_int    res;
  cl_uint   nq2;
  cl_uint   triangles;
  uint      current_device;
  uint      current_queue;
  uint      current_kernel_id;
  uint      current_thread;
  cl_uint   num_devices = ocl_system.num_devices;
  cl_uint   num_queues = ocl_system.queues_per_device;
  cl_event  h2d[2], calc;
  cl_kernel oclkernel;
  cl_command_queue queue;
  size_t    global_off[] = {
    0
  };
  size_t    local_size[] = {
    128
  };
  size_t    global_size[] = {
    ((mdata->pos + local_size[0] - 1) / local_size[0]) * local_size[0]
  };
  field     k = mdata->bem->k;
  field     alpha = mdata->bem->alpha;

  /****************************************************
   * Determine device, queue and kernel.
   ****************************************************/

  current_device = omp_get_thread_num() / num_queues;
  current_queue = omp_get_thread_num() % num_queues;
  current_thread = current_queue + current_device * num_queues;
  current_kernel_id = current_queue + current_device * num_queues
    + kernel * num_devices * num_queues;
  oclkernel = ocl_bem3d.kernels[current_kernel_id];
  queue = ocl_system.queues[current_thread];

  /****************************************************
   * Transfer input data.
   ****************************************************/

  res = clEnqueueWriteBuffer(queue, ocl_bem3d.mem_ridx[current_thread],
			     CL_FALSE, 0, mdata->pos * sizeof(uint),
			     mdata->ridx, 0, NULL, h2d + 0);
  CL_CHECK(res)

    res = clEnqueueWriteBuffer(queue, ocl_bem3d.mem_cidx[current_thread],
			       CL_FALSE, 0, mdata->pos * sizeof(uint),
			       mdata->cidx, 0, NULL, h2d + 1);
  CL_CHECK(res)

  /****************************************************
   * Setup kernel arguments for 'assemble_xxx_cc_list_z'
   ****************************************************/
    triangles = ocl_bem3d.triangles;

  if (kernel == 0 || kernel == 4) {
    nq2 = ocl_bem3d.nq;
    res = clSetKernelArg(oclkernel, 0, sizeof(cl_mem),
			 &ocl_bem3d.mem_q_xw[current_device]);
    CL_CHECK(res)
      res = clSetKernelArg(oclkernel, 1, sizeof(cl_uint), &nq2);
  }
  else {
    nq2 = ocl_bem3d.nq2;
    res = clSetKernelArg(oclkernel, 0, sizeof(cl_mem),
			 &ocl_bem3d.mem_q2_xw[current_device]);
    CL_CHECK(res)
      res = clSetKernelArg(oclkernel, 1, sizeof(cl_uint), &nq2);
  }
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 2, sizeof(cl_mem),
			 &ocl_bem3d.mem_gr_t[current_device]);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 3, sizeof(cl_mem),
			 &ocl_bem3d.mem_gr_x[current_device]);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 4, sizeof(cl_uint), &triangles);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 5, sizeof(cl_mem),
			 &ocl_bem3d.mem_ridx[current_thread]);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 6, sizeof(cl_mem),
			 &ocl_bem3d.mem_cidx[current_thread]);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 7, sizeof(cl_mem),
			 &ocl_bem3d.mem_N[current_thread]);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 8, sizeof(cl_uint), &mdata->pos);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 9, sizeof(field), &k);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 10, sizeof(field), &alpha);
  CL_CHECK(res)

  /****************************************************
   * Invoke the kernel 'assemble_xxx_cc_list_z' on the GPU
   ****************************************************/
    res = clEnqueueNDRangeKernel(queue, oclkernel, 1, global_off, global_size,
				 local_size, 2, h2d, &calc);
  CL_CHECK(res)

  /****************************************************
   * Copy results back.
   ****************************************************/
    res =
    clEnqueueReadBuffer(queue, ocl_bem3d.mem_N[current_thread], CL_TRUE, 0,
			mdata->pos * sizeof(field), mdata->N, 1, &calc, NULL);
  CL_CHECK(res)

    clReleaseEvent(h2d[0]);
  clReleaseEvent(h2d[1]);
  clReleaseEvent(calc);
}
Example #16
0
int 
NBody::runCLKernels()
{
    cl_int status;
    cl_event events[1];

    /* 
    * Enqueue a kernel run call.
    */
    size_t globalThreads[] = {numBodies};
    size_t localThreads[] = {groupSize};

    if(localThreads[0] > maxWorkItemSizes[0] ||
       localThreads[0] > maxWorkGroupSize)
    {
        std::cout << "Unsupported: Device"
            "does not support requested number of work items.";
        return SDK_FAILURE;
    }

    status = clEnqueueNDRangeKernel(
        commandQueue,
        kernel,
        1,
        NULL,
        globalThreads,
        localThreads,
        0,
        NULL,
        NULL);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clEnqueueNDRangeKernel failed."))
    {
        return SDK_FAILURE;
    }

    status = clFinish(commandQueue);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clFinish failed."))
    {
        return SDK_FAILURE;
    }

    /* Copy data from new to old */
    status = clEnqueueCopyBuffer(commandQueue,
                                 newPos,
                                 currPos,
                                 0,
                                 0,
                                 sizeof(cl_float4) * numBodies,
                                 0,
                                 0,
                                 0);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clEnqueueCopyBuffer failed.(newPos->oldPos)"))
    {
        return SDK_FAILURE;
    }

    status = clEnqueueCopyBuffer(commandQueue,
                                 newVel,
                                 currVel,
                                 0,
                                 0,
                                 sizeof(cl_float4) * numBodies,
                                 0,
                                 0,
                                 0);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clEnqueueCopyBuffer failed.(newVel->oldVels)"))
    {
        return SDK_FAILURE;
    }

    status = clFinish(commandQueue);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clFinish failed."))
    {
        return SDK_FAILURE;
    }

    /* Enqueue readBuffer*/
    status = clEnqueueReadBuffer(
        commandQueue,
        currPos,
        CL_TRUE,
        0,
        numBodies* sizeof(cl_float4),
        pos,
        0,
        NULL,
        &events[0]);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clEnqueueReadBuffer failed."))
        return SDK_FAILURE;

    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &events[0]);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clWaitForEvents failed."))
        return SDK_FAILURE;

    clReleaseEvent(events[0]);

    return SDK_SUCCESS;
}
Example #17
0
float sgemmMain(int rowa,int cola,int colb)
{
	 cl_context context = 0;
	 cl_command_queue commandQueue = 0;
	 cl_program program = 0;
	 cl_device_id device = 0;
	 cl_kernel kernel = 0;
	 const unsigned int numberOfMemoryObjects = 3;
	 cl_mem memoryObjectsa = 0;
	 cl_mem memoryObjectsb = 0;
	 cl_mem memoryObjectsc = 0;
	 cl_int errorNumber;
	 cl_uint clrowa = rowa;
	 cl_uint clcola = cola;
	 cl_uint clcolb = colb;
	 int err;
	 err = createContext(&context);
	 LOGD("create context");
	 err = createCommandQueue(context, &commandQueue, &device);
	 err = createProgram(context, device, "/mnt/sdcard/kernel/sgemm.cl", &program);
	 kernel = clCreateKernel(program, "sgemm", &errorNumber);
	 LOGD("createKernel code %d",errorNumber);
	 LOGD("start computing");
	 float alpha = 1;
	 float beta = 0.1;

	 /* Create the matrices. */
	 size_t matrixSizea = rowa * cola;
	 size_t matrixSizeb = cola * colb;
	 size_t matrixSizec = rowa * colb;

	 /* As all the matrices have the same size, the buffer size is common. */
	 size_t bufferSizea = matrixSizea * sizeof(float);
	 size_t bufferSizeb = matrixSizeb * sizeof(float);
	 size_t bufferSizec = matrixSizec * sizeof(float);

	 /* Create buffers for the matrices used in the kernel. */
	 int createMemoryObjectsSuccess = 0;
	 memoryObjectsa = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizea, NULL, &errorNumber);
	 createMemoryObjectsSuccess &= errorNumber;
	 memoryObjectsb = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizeb, NULL, &errorNumber);
	 createMemoryObjectsSuccess &= errorNumber;
	 memoryObjectsc = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bufferSizec, NULL, &errorNumber);
	 createMemoryObjectsSuccess &= errorNumber;
	 LOGD("create memory err %d",createMemoryObjectsSuccess);
	 int mapMemoryObjectsSuccess = 0;
	 cl_float* matrixA = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsa, CL_TRUE, CL_MAP_WRITE, 0, bufferSizea, 0, NULL, NULL, &errorNumber);
	 mapMemoryObjectsSuccess &= errorNumber;
	 cl_float* matrixB = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsb, CL_TRUE, CL_MAP_WRITE, 0, bufferSizeb, 0, NULL, NULL, &errorNumber);
	 mapMemoryObjectsSuccess &= errorNumber;
	 cl_float* matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_WRITE, 0, bufferSizec, 0, NULL, NULL, &errorNumber);
	 mapMemoryObjectsSuccess &= errorNumber;
	 LOGD("map memory err %d",mapMemoryObjectsSuccess);

	 sgemmInitialize(rowa,cola,colb, matrixA, matrixB, matrixC);
	 LOGD("data initial finish");
	 int unmapMemoryObjectsSuccess = 0;
	 errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsa, matrixA, 0, NULL, NULL);
	 LOGD("memory code %d",errorNumber);
	 unmapMemoryObjectsSuccess &= errorNumber;
	 errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsb, matrixB, 0, NULL, NULL);
	 LOGD("memory code %d",errorNumber);
	 unmapMemoryObjectsSuccess &= errorNumber;
	 errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL);
	 LOGD("memory code %d",errorNumber);
	 unmapMemoryObjectsSuccess &= errorNumber;
	 LOGD("unmap memory err %d",unmapMemoryObjectsSuccess);

	 int setKernelArgumentsSuccess = 0;
	 errorNumber = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjectsa);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjectsb);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjectsc);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 3, sizeof(cl_uint), &clrowa);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 4, sizeof(cl_uint), &clcola);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 5, sizeof(cl_uint), &clcolb);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 6, sizeof(cl_float), &alpha);
	 setKernelArgumentsSuccess &= errorNumber;
	 errorNumber = clSetKernelArg(kernel, 7, sizeof(cl_float), &beta);
	 setKernelArgumentsSuccess &= errorNumber;
	 LOGD("setKernel err %d",setKernelArgumentsSuccess);

	 LOGD("start running kernel");
	 clock_t start_t,end_t;
	 float cost_time;
	 start_t = clock();
	 cl_event event = 0;
	 size_t globalWorksize[2] = {rowa, colb};
	 errorNumber = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorksize, NULL, 0, NULL, &event);
	 //LOGD("Enqueue err code %d",errorNumber);
	 errorNumber = clFinish(commandQueue);
	 end_t = clock();
	 cost_time = (float)(end_t-start_t)/CLOCKS_PER_SEC*1000;
	 LOGD("Finish err code %d",errorNumber);
	 float time;
	 time = printProfilingInfo(event);
	 LOGT("using CPU clock: %f ms",cost_time);
	 LOGT("using GPU clock: %f ms",time);
	 clReleaseEvent(event);
	 matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_READ, 0, bufferSizec, 0, NULL, NULL, &errorNumber);
	 clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL);
	 LOGD("read out matrixC finish");
	 LOGD("matrixC value C(0,0): %f",matrixC[0]);
	 cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjectsa, memoryObjectsb,memoryObjectsc,numberOfMemoryObjects);
	 LOGD("RUNNING finsh");
	 return time;
}
int
BoxFilterGLSeparable::runCLKernels()
{
    cl_int status;
    cl_event events[2];



    /* Set appropriate arguments to the kernel */

    /* input buffer image */
    status = clSetKernelArg(
                 horizontalKernel,
                 0,
                 sizeof(cl_mem),
                 &inputImageBuffer);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clSetKernelArg failed. (inputImageBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* outBuffer imager */
    status = clSetKernelArg(
                 horizontalKernel,
                 1,
                 sizeof(cl_mem),
                 &tempImageBuffer);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clSetKernelArg failed. (outputImageBuffer)"))
    {
        return SDK_FAILURE;
    }


    /* filter width */
    status = clSetKernelArg(
                 horizontalKernel,
                 2,
                 sizeof(cl_int),
                 &filterWidth);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clSetKernelArg failed. (filterWidth)"))
    {
        return SDK_FAILURE;
    }

#ifdef USE_LDS
    /* shared memory */
    status = clSetKernelArg(
                 horizontalKernel,
                 3,
                 (GROUP_SIZE + filterWidth - 1) * sizeof(cl_uchar4),
                 0);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clSetKernelArg failed. (local memory)"))
    {
        return SDK_FAILURE;
    }
#endif
    /*
    * Enqueue a kernel run call.
    */
    size_t globalThreads[] = {width, height};
    size_t localThreads[] = {blockSizeX, blockSizeY};

    status = clEnqueueNDRangeKernel(
                 commandQueue,
                 horizontalKernel,
                 2,
                 NULL,
                 globalThreads,
                 localThreads,
                 0,
                 NULL,
                 &events[0]);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clEnqueueNDRangeKernel failed."))
    {
        return SDK_FAILURE;
    }

    status = clWaitForEvents(1, &events[0]);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clWaitForEvents failed."))
        return SDK_FAILURE;

    /* Do vertical pass */

    /* Set appropriate arguments to the kernel */

    /* input buffer image */
    status = clSetKernelArg(
                 verticalKernel,
                 0,
                 sizeof(cl_mem),
                 &tempImageBuffer);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clSetKernelArg failed. (inputImageBuffer)"))
    {
        return SDK_FAILURE;
    }

    // Acquire GL buffer
    status = clEnqueueAcquireGLObjects(commandQueue,
                                       1,
                                       &outputImageBuffer,
                                       0,
                                       0,
                                       NULL);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clEnqueueAcquireGLObjects failed."))
        return SDK_FAILURE;

    /* outBuffer imager */
    status = clSetKernelArg(
                 verticalKernel,
                 1,
                 sizeof(cl_mem),
                 &outputImageBuffer);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clSetKernelArg failed. (outputImageBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* filter width */
    status = clSetKernelArg(
                 verticalKernel,
                 2,
                 sizeof(cl_int),
                 &filterWidth);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clSetKernelArg failed. (filterWidth)"))
    {
        return SDK_FAILURE;
    }

    /*
    * Enqueue a kernel run call.
    */
    //size_t globalThreads[] = {width, height};
    //size_t localThreads[] = {blockSizeX, blockSizeY};

    status = clEnqueueNDRangeKernel(
                 commandQueue,
                 verticalKernel,
                 2,
                 NULL,
                 globalThreads,
                 localThreads,
                 0,
                 NULL,
                 &events[0]);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clEnqueueNDRangeKernel failed."))
    {
        return SDK_FAILURE;
    }

    status = clWaitForEvents(1, &events[0]);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clWaitForEvents failed."))
        return SDK_FAILURE;

    /* Read buffer only if verification flag is true */
    if(verify)
    {
        /* Enqueue readBuffer*/
        status = clEnqueueReadBuffer(
                     commandQueue,
                     outputImageBuffer,
                     CL_TRUE,
                     0,
                     width * height * pixelSize,
                     outputImageData,
                     0,
                     NULL,
                     &events[0]);
        if(!sampleCommon->checkVal(
                    status,
                    CL_SUCCESS,
                    "clEnqueueReadBuffer failed."))
            return SDK_FAILURE;

        /* Wait for the read buffer to finish execution */
        status = clWaitForEvents(1, &events[0]);
        if(!sampleCommon->checkVal(
                    status,
                    CL_SUCCESS,
                    "clWaitForEvents failed."))
            return SDK_FAILURE;
    }

    /* Now OpenGL gets control of outputImageBuffer */
    status = clEnqueueReleaseGLObjects(commandQueue,
                                       1,
                                       &outputImageBuffer,
                                       0,
                                       0,
                                       0);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clEnqueueReleaseGLObjects failed."))
        return SDK_FAILURE;

    status = clFinish(commandQueue);
    if(!sampleCommon->checkVal(
                status,
                CL_SUCCESS,
                "clFinish failed."))
        return SDK_FAILURE;

    clReleaseEvent(events[0]);

    return SDK_SUCCESS;
}
int 
BinomialOption::runCLKernels()
{
    cl_int status;

     /*
     * This algorithm reduces each group of work-items to a single value
     * on OpenCL device
     */

    /* Set appropriate arguments to the kernel */

    /* number of steps */
    status = clSetKernelArg(kernel, 0, sizeof(int), (void*)&numSteps);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clSetKernelArg failed. (numSteps)"))
    {
        return SDK_FAILURE;
    }

    /* randBuffer */
    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&randBuffer);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clSetKernelArg failed. (randBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* outBuffer */
    status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&outBuffer);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clSetKernelArg failed. (outBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* local memory callA */
    status = clSetKernelArg(kernel,
                            3,
                            (numSteps + 1) * sizeof(cl_float4),
                            NULL);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clSetKernelArg failed. (callA)"))
    {
        return SDK_FAILURE;
    }

    /* local memory callB */
    status = clSetKernelArg(kernel,
                            4,
                            numSteps * sizeof(cl_float4),
                            NULL);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clSetKernelArg failed. (callB)"))
    {
        return SDK_FAILURE;
    }

    /**
     * Enqueue a kernel run call.
     */
    size_t globalThreads[] = {numSamples * (numSteps + 1)};
    size_t localThreads[] = {numSteps + 1};

    if(localThreads[0] > maxWorkItemSizes[0] || localThreads[0] > maxWorkGroupSize)
    {
        std::cout << "Unsupported: Device does not support"
            "requested number of work items.";
        return SDK_FAILURE;
    }

    status = clGetKernelWorkGroupInfo(kernel,
                                      devices[deviceId],
                                      CL_KERNEL_LOCAL_MEM_SIZE,
                                      sizeof(cl_ulong),
                                      &usedLocalMemory,
                                      NULL);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetKernelWorkGroupInfo"
                               "CL_KERNEL_LOCAL_MEM_SIZE failed."))
    {
        return SDK_FAILURE;
    }

    if(usedLocalMemory > totalLocalMemory)
    {
        std::cout << "Unsupported: Insufficient local memory on device."
            << std::endl;
        return SDK_FAILURE;
    }
    

    status = clEnqueueNDRangeKernel(commandQueue,
                                    kernel,
                                    1,
                                    NULL,
                                    globalThreads,
                                    localThreads,
                                    0,
                                    NULL,
                                    NULL);
    if(!sampleCommon->checkVal(status, 
                               CL_SUCCESS, 
                               "clEnqueueNDRangeKernel failed."))
    {
        return SDK_FAILURE;
    }

    status = clFinish(commandQueue);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clFinish failed."))
    {
        return SDK_FAILURE;
    }
   
    cl_event events[1]; 
    /* Enqueue readBuffer*/
    status = clEnqueueReadBuffer(commandQueue,
                                 outBuffer,
                                 CL_TRUE,
                                 0,
                                 numSamples * sizeof(cl_float4),
                                 output,
                                 0,
                                 NULL,
                                 &events[0]);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clEnqueueReadBuffer failed."))
    {
        return SDK_FAILURE;
    }
    
    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &events[0]);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clWaitForEvents failed."))
    {
        return SDK_FAILURE;
    }

    clReleaseEvent(events[0]);

    return SDK_SUCCESS;
}
Example #20
0
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX, bufAsum, scratchBuff;
    cl_event event = NULL;
    int ret = 0;
	int lenX = 1 + (N-1)*abs(incx);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err);
    bufAsum = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float)), NULL, &err);
    // Allocate minimum of N elements
    scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);

    /* Call clblas function. */
    err = clblasSasum( N, bufAsum, 0, bufX, 0, incx, scratchBuff,
                                    1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSasum() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufAsum, CL_TRUE, 0, sizeof(cl_float),
                                    &asum, 0, NULL, NULL);
        printf("Result : %f\n", asum);
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufAsum);
    clReleaseMemObject(scratchBuff);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Example #21
0
// Main function 
// *********************************************************************
int main(int argc, char **argv)
{
    shrQAStart(argc, argv);
	int NUM_BLOCKS = 10;
	shrSetLogFileName ("Barrier_Centralized.txt");
	while(NUM_BLOCKS<=120)
	{
	
	
	int iNumElements = NUM_BLOCKS* NUM_THREADS;	// total num of threads
	// BARRIER GOAL
	int goal_val = NUM_BLOCKS;
	// get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
    
    // start logs 
	cExecutableName = argv[0];
    shrSetLogFileName ("Barrier.txt");
    shrLog("%s Starting...\n\n# of THREADS \t= %i\n", argv[0], iNumElements); 

    // set and log Global and Local work size dimensions
    szLocalWorkSize = NUM_THREADS ;
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
    shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", 
           szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); 

    

    //Get an OpenCL platform
    ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL);

    shrLog("clGetPlatformID...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    //Get the devices
    ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
    shrLog("clGetDeviceIDs...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    //Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1);
    shrLog("clCreateContext...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErr1);
    shrLog("clCreateCommandQueue...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }


	

    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile); 
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);

    // Create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
    shrLog("clCreateProgramWithSource...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Build the program with 'mad' Optimization option
    #ifdef MAC
        char* flags = "-cl-fast-relaxed-math -DMAC";
    #else
        char* flags = "-cl-fast-relaxed-math";
    #endif
    ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
    shrLog("clBuildProgram...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Create the kernel
    ckKernel = clCreateKernel(cpProgram, "Barrier", &ciErr1);
    shrLog("clCreateKernel (Barrier)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }



	 // Allocate and initialize host arrays 
    shrLog( "Allocate and Init Host Mem...\n"); 
    input = (int *)malloc(sizeof(int) * NUM_BLOCKS);

	for(int i =0; i<=NUM_BLOCKS; i++)
	{
		input[i]=0;

	}

	// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    array_in = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1);
    array_out = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1);
	
	if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }


    // Set the Argument values
    
    ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_int), (void*)&goal_val);
	ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&array_in);
	ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&array_out);

   // ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_int), (void*)&iNumElements);
    shrLog("clSetKernelArg 0 - 2...\n\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }






    // --------------------------------------------------------
    // Start Core sequence... copy input data to GPU, compute, copy results back



	ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, array_in, CL_FALSE, 0, sizeof(int) * NUM_BLOCKS,(void*) input, 0, NULL, NULL);
    
    shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }


    // Launch kernel
    ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent);
    shrLog("clEnqueueNDRangeKernel (Barrier)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

   /*ciErr1 = clEnqueueReadBuffer(cqCommandQueue, global_mutex, CL_TRUE, 0, sizeof(cl_int), &original_goal, 0, NULL, NULL);
    shrLog("clEnqueueReadBuffer (Dst)...%d \n\n", original_goal); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }*/


	//GPU_PROFILING
    ciErr1=clWaitForEvents(1, &ceEvent);
	if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error 1 !\n\n");
        Cleanup(argc, argv, EXIT_FAILURE);
    }
       
        cl_ulong start, end;
     ciErr1 =   clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
      ciErr1 |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
        //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

	if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error 2 !\n\n");
        Cleanup(argc, argv, EXIT_FAILURE);
    }
        double dSeconds = 1.0e-9 * (double)(end - start);
		shrLog("Done! time taken %ul \n",end - start );
      // shrLog("Done! Kernel execution time: %.5f s\n\n", dSeconds);


		// Release event
       clReleaseEvent(ceEvent);
       ceEvent = 0;

    
		    Cleanup (argc, argv,  EXIT_SUCCESS);

			NUM_BLOCKS = NUM_BLOCKS+10;
		}

		shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
}
Example #22
0
int 
URNG::runCLKernels()
{
    cl_int status;
    cl_event events[2];

    /* Set appropriate arguments to the kernel */

    /* input buffer image */
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImageBuffer);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clSetKernelArg failed. (inputImageBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* outBuffer imager */
    status = clSetKernelArg(kernel, 1, sizeof(cl_mem),&outputImageBuffer);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clSetKernelArg failed. (outputImageBuffer)"))
    {
        return SDK_FAILURE;
    }

    /* input buffer image */
    status = clSetKernelArg(kernel, 2, sizeof(factor), &factor);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clSetKernelArg failed. (factor)"))
    {
        return SDK_FAILURE;
    }


    /* 
    * Enqueue a kernel run call.
    */
    size_t globalThreads[] = {width, height};
    size_t localThreads[] = {blockSizeX, blockSizeY};

    status = clEnqueueNDRangeKernel(
        commandQueue,
        kernel,
        2,
        NULL,
        globalThreads,
        localThreads,
        0,
        NULL,
        &events[0]);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clEnqueueNDRangeKernel failed."))
    {
        return SDK_FAILURE;
    }

    status = clWaitForEvents(1, &events[0]);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clWaitForEvents failed."))
        return SDK_FAILURE;


    /* Enqueue readBuffer*/
    status = clEnqueueReadBuffer(
        commandQueue,
        outputImageBuffer,
        CL_TRUE,
        0,
        width * height * pixelSize,
        outputImageData,
        0,
        NULL,
        &events[0]);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clEnqueueReadBuffer failed."))
        return SDK_FAILURE;

    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &events[0]);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clWaitForEvents failed."))
        return SDK_FAILURE;

    clReleaseEvent(events[0]);


    return SDK_SUCCESS;
}
Example #23
0
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufA, bufX;
    cl_event event = NULL;
    int ret = 0;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * lda * sizeof(cl_float2),
                          NULL, &err);
    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float2),
                          NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
        N * lda * sizeof(cl_float2), A, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
        N * sizeof(cl_float2), X, 0, NULL, NULL);


    err = clblasCher(order, uplo, N, alpha, bufX, 0 /*offx */, incx,
										bufA, 0 /*offa */, lda, 1, &queue, 0, NULL, &event);

   	if (err != CL_SUCCESS) {
        printf("clblasCher() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (N * lda * sizeof(cl_float2)),
                                  A, 0, NULL, NULL);
        /* At this point you will get the result of CHER placed in A array. */
        printResult();
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufA);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
void
filter_OpenCL_impl(ComputeEnv *env,
                   Buffer *packed_input_buf,
                   Buffer *packed_output_buf,
                   int nInputPlanes,
                   int nOutputPlanes,
                   const float *fbiases,
                   const float *weight,
                   int w,
                   int h,
                   int nJob)
{
        cl_int err;
        int dev_id = 0;

        OpenCLDev *dev = &env->cl_dev_list[dev_id];
	size_t in_size = sizeof(float) * w * h * nInputPlanes;
        cl_context context = dev->context;

        cl_mem cl_packed_input = packed_input_buf->get_read_ptr_cl(env, dev_id, in_size);
        cl_mem cl_packed_output = packed_output_buf->get_write_ptr_cl(env, dev_id);

        cl_mem cl_fbiases = clCreateBuffer(context,
                                           CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
                                           sizeof(float) * nOutputPlanes,
                                           (void*)fbiases, &err
                );
        enum filter_type {
                FILTER_GENERIC,
                FILTER_IN1,
                FILTER_IN3,
                FILTER_OUT1,
                FILTER_OUT3,
        } type = FILTER_GENERIC;

        cl_kernel ker = dev->ker_filter;
        bool static_nplane = false;

        if (nInputPlanes == 1 && nOutputPlanes == 32) {
                type = FILTER_IN1;
                ker = dev->ker_filter_in1_out32;
        } else if (nInputPlanes == 3 && nOutputPlanes == 32) {
                type = FILTER_IN3;
                ker = dev->ker_filter_in3_out32;
                static_nplane = true;
        } else if (nOutputPlanes == 1 && nInputPlanes == 128) {
                type = FILTER_OUT1;
                ker = dev->ker_filter_in128_out1;
        } else if (nOutputPlanes == 3 && nInputPlanes == 128) {
                type = FILTER_OUT3;
                ker = dev->ker_filter_in128_out3;
                static_nplane = true;
        }


        size_t weight_size;

        if (type == FILTER_GENERIC) {
                weight_size = sizeof(float) * GPU_VEC_WIDTH * nInputPlanes * 9;
        } else {
                weight_size = sizeof(float) * nOutputPlanes * nInputPlanes * 9;
        }

        cl_mem cl_weight = clCreateBuffer(context,
                                          CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
                                          weight_size,
                                          (void*)weight, &err
                );

        int ai = 0;

        clSetKernelArg(ker, ai++, sizeof(cl_mem), &cl_packed_input);
        if (! static_nplane) {
                clSetKernelArg(ker, ai++, sizeof(cl_int), &nInputPlanes);
        }
        clSetKernelArg(ker, ai++, sizeof(cl_mem), &cl_packed_output);
        if (! static_nplane) {
                clSetKernelArg(ker, ai++, sizeof(cl_int), &nOutputPlanes);
        }
        clSetKernelArg(ker, ai++, sizeof(cl_mem), &cl_fbiases);
        clSetKernelArg(ker, ai++, sizeof(cl_int), &h);
        clSetKernelArg(ker, ai++, sizeof(cl_int), &w);
        clSetKernelArg(ker, ai++, sizeof(cl_mem), &cl_weight);

        size_t local_size = 0;
        //local_size += sizeof(float) * 256;
        //local_size += sizeof(float) * GPU_VEC_WIDTH;

        if (type == FILTER_GENERIC) {
                local_size += sizeof(float) * nInputPlanes * (GPU_BLOCK_SIZE+2) * 3;
                clSetKernelArg(ker, ai++, local_size, nullptr);
        }

        cl_event event;

        size_t gws[3] = {1, 1, 1};
        size_t lws[3] = {1, 1, 1};
        if (type == FILTER_GENERIC) {
                gws[0] = h * nOutputPlanes;
                lws[0] = nOutputPlanes;
        } else if (type == FILTER_IN1) {
                gws[0] = h * 256;
                lws[0] = 256;
        } else if (type == FILTER_OUT1 || type == FILTER_OUT3) {
                gws[0] = h*128;
                lws[0] = 128;
        } else if (type == FILTER_IN3) {
                gws[0] = h * 192;
                lws[0] = 192;
        }

        err = clEnqueueNDRangeKernel(dev->queue,
                                     ker,
                                     3,
                                     nullptr, gws, lws,
                                     0, nullptr, &event);
        if (err != CL_SUCCESS) {
                printf("enqueue ndrange error : %d\n", err);
                exit(1);
        }

        err = clWaitForEvents(1, &event);
        if (err != CL_SUCCESS) {
                printf("wait ndrange error : %d\n", err);
                exit(1);
        }

        if (err != CL_SUCCESS) {
                printf("read buffer error : %d\n", err);
                exit(1);
        }

        clReleaseMemObject(cl_fbiases);
        clReleaseMemObject(cl_weight);
        clReleaseEvent(event);
}
int
LDSBandwidth::bandwidth(cl_kernel &kernel)
{
    cl_int status;

    // Check group size against kernelWorkGroupSize
    status = clGetKernelWorkGroupInfo(kernel,
                                      devices[sampleArgs->deviceId],
                                      CL_KERNEL_WORK_GROUP_SIZE,
                                      sizeof(size_t),
                                      &kernelWorkGroupSize,
                                      0);
    CHECK_OPENCL_ERROR(status, "clGetKernelWorkGroupInfo failed.");

    if(localThreads > kernelWorkGroupSize)
    {
        localThreads = kernelWorkGroupSize;
    }

    // Set appropriate arguments to the kernel

    size_t size = (NUM_READS + localThreads) * vectorSize * sizeof(cl_float);
    // Local memory
    status = clSetKernelArg(kernel,
                            0,
                            size,
                            0);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(local memory)");

    // Output buffer
    status = clSetKernelArg(kernel,
                            1,
                            sizeof(cl_mem),
                            (void *)&outputBuffer);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(outputBuffer)");

    // Get used local memory
    status =  clGetKernelWorkGroupInfo(kernel,
                                       devices[sampleArgs->deviceId],
                                       CL_KERNEL_LOCAL_MEM_SIZE,
                                       sizeof(cl_ulong),
                                       &usedLocalMemory,
                                       NULL);
    CHECK_OPENCL_ERROR(status,
                       "clGetKernelWorkGroupInfo CL_KERNEL_LOCAL_MEM_SIZE failed.");

    if(usedLocalMemory > deviceInfo.localMemSize)
    {
        std::cout << "Unsupported: Insufficient local memory on device." << std::endl;
        return SDK_FAILURE;
    }

    double sec = 0;

    if(sampleArgs->deviceType.compare("cpu") == 0)
    {
        iterations = 10;
    }

    // Run the kernel for a number of iterations
    for(int i = 0; i < iterations; i++)
    {
        // Enqueue a kernel run call
        cl_event ndrEvt;
        status = clEnqueueNDRangeKernel(commandQueue,
                                        kernel,
                                        1,
                                        NULL,
                                        &globalThreads,
                                        &localThreads,
                                        0,
                                        NULL,
                                        &ndrEvt);
        CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

        // wait for the kernel call to finish execution
        status = clWaitForEvents(1, &ndrEvt);
        CHECK_OPENCL_ERROR(status, "clWaitForEvents failed.");

        // Calculate performance
        cl_ulong startTime;
        cl_ulong endTime;

        // Get kernel profiling info
        status = clGetEventProfilingInfo(ndrEvt,
                                         CL_PROFILING_COMMAND_START,
                                         sizeof(cl_ulong),
                                         &startTime,
                                         0);
        CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo failed.(startTime)");

        status = clGetEventProfilingInfo(ndrEvt,
                                         CL_PROFILING_COMMAND_END,
                                         sizeof(cl_ulong),
                                         &endTime,
                                         0);
        CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo failed.(endTime)");

        // Cumulate time for each iteration
        sec += 1e-9 * (endTime - startTime);

        status = clReleaseEvent(ndrEvt);
        CHECK_OPENCL_ERROR(status, "clReleaseEvent failed.(endTime)");
    }

    // Copy bytes
    int bytesPerThread = 0;
    if(vec3 == true)
    {
        bytesPerThread = NUM_READS * 3 * sizeof(cl_float);
    }
    else
    {
        bytesPerThread = NUM_READS * vectorSize * sizeof(cl_float);
    }
    double bytes = (double)(iterations * bytesPerThread);
    double perf = (bytes / sec) * 1e-9;
    perf *= globalThreads;

    std::cout << ": " << perf << " GB/s" << std::endl;

    return SDK_SUCCESS;
}
Example #26
0
int main(int argc, const char** argv)
{
	size_t x = 512, y = 250000; //y has to be a multiple of ciDeviceCount!
	struct svm_node* px = (struct svm_node*)malloc((x+1)*sizeof(struct svm_node));
	gen_data(px, x, 1, 3);
	struct svm_node* py = (struct svm_node*)malloc((x+1)*y*sizeof(struct svm_node));
	for(size_t i = 0; i < y; ++i) {
		struct svm_node* tmp = py+i*(x+1);
		gen_data(tmp, x, 3,2);
	}
	dtype* result = (dtype*)malloc(y*sizeof(dtype));
	int* pyLength = (int*)malloc(y*sizeof(int));
	
	for(size_t i = 0; i < y; ++i)
	{
		for(size_t j = 0; py[i*(x+1)+j].index >= 0; ++j)
			pyLength[i] = py[i*(x+1)+j].index;
		++pyLength[i];
	}
	
	cl_int err = CL_SUCCESS;
//	cl_platform_id platform = NULL;
//	cl_uint ciDeviceCount = 0;
//	cl_device_id *device = NULL;

	// retrieve devices
	cl_platform_id platform;
	err = clGetPlatformIDs(1, &platform, NULL);
	cl_device_id device;
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);

	size_t localDim  = 256l;
	size_t globalDim = localDim*y;
/*	
	device = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, ciDeviceCount, device, NULL);
	if (err != CL_SUCCESS)
	{
		printf("Failed to get devices:\n%s\n", oclErrorString(err));
		return -1;
	}
	*/
	//Create the context
	cl_context context1 = clCreateContext(0, 1, &device, NULL, NULL, &err);
	if(err != CL_SUCCESS)
	{
		printf("Context creation failed:\n%d\n", err);
		return -1;
	}										 

	// create a command queue for first device the context reported
	cl_command_queue queue = clCreateCommandQueue(context1, device, 0, 0);
	
	// load program from disk
	char *tmp = strdup(argv[0]);
	char* my_dir = dirname(tmp);

//	size_t program_length;
	char path[256];
  	snprintf(path, PATH_MAX - 1, "%s/vecops.cl", my_dir);
 
	cl_program vecops = load_kernel(path, context1);

	if(err != CL_SUCCESS)
	{
		printf("Program creation failed:\n%d\n", (err));
		return -1;
	}
 
	err = clBuildProgram(vecops, 0, NULL, "-I.", NULL, NULL);
	if(err != CL_SUCCESS)
	{
			err = clGetProgramBuildInfo(vecops, device, CL_PROGRAM_BUILD_LOG, 8192, buffer, NULL);
			if(err != CL_SUCCESS)
				printf("Cannot get build info: %d\n", (err));

			printf("Build log:\n%s\n", buffer);
	}
	
	// create kernel
	cl_kernel sparsedot_kernel;
	
#if version == 1
	sparsedot_kernel = clCreateKernel(vecops, "sparsedot1_kernel", &err);
#endif
#if version == 2
	sparsedot_kernel = clCreateKernel(vecops, "sparsedot4_kernel", &err);
#endif
#if version == 3
	sparsedot_kernel = clCreateKernel(vecops, "sparsedot3_kernel", &err);
#endif
	if (err != CL_SUCCESS)
	{
		printf("Kernel creation failed:\n%d\n", (err));
		return -1;
	}
	
	 
	// allocate memory on the devices
	cl_mem px_d, py_d, result_d, pyLength_d;
	
#if version == 1
	px_d = clCreateBuffer(context1,
							 CL_MEM_READ_ONLY,
							 (x+1) * sizeof(struct svm_node),
							 0, &err);
#endif
#if version == 2 || version == 3
	//unpack px
	int size = px[x-1].index+1;

	for(size_t i = 0; i < y; ++i)
		size = size > pyLength[i] ? size : pyLength[i];

	dtype* px_u = (dtype*)calloc(size, sizeof(dtype));

	unpack(px, px_u);
	printf("px size: %d\n", size);
#endif
#if version == 3
	size_t height, width;
	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &height, 0);
	clGetDeviceInfo(Device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &width, 0);

	size_t region[3];
	region[2] = 1;

	region[0] = min(4, size);
	region[1] = (size+2-1) / 4;
		

	cl_image_format px_format;
	px_format.image_channel_order = CL_R;
	px_format.image_channel_data_type = CL_FLOAT;
#endif
#if version == 2
	px_d = clCreateBuffer(context1,
				 CL_MEM_READ_ONLY,
				 size * sizeof(dtype),
				 0, &err);
#endif
#if version == 3
	 px_d = clCreateImage2D(context1, CL_MEM_READ_ONLY, &px_format,
				  region[0], region[1], 0, 0, &err);

#endif
	if(err != CL_SUCCESS)
	{
		printf("Failed to allocate px:\n%d\n", (err));
		return -1;
	}
	py_d = clCreateBuffer(context1,
		 CL_MEM_READ_ONLY,
		 (x+1) * y * sizeof(struct svm_node),
		 0, &err);
	if(err != CL_SUCCESS)
	{
		printf("Failed to allocate px:\n%d\n", (err));
		return -1;
	}
	result_d = clCreateBuffer(context1,
		CL_MEM_WRITE_ONLY,
		y * sizeof(dtype),
		0, 0);
	pyLength_d = clCreateBuffer(context1,
		CL_MEM_READ_ONLY,
		y * sizeof(int),
		0, 0);
	

#if bench
	//start time measurement
	start_timer(0);
#endif

	// copy host vectors to device
	err = CL_SUCCESS;
   
    err |= clEnqueueWriteBuffer(queue, py_d, CL_FALSE, 0, 
								(x+1) * y * sizeof(struct svm_node), py, 0, NULL, NULL);
									

	err |= clEnqueueWriteBuffer(queue, pyLength_d, CL_FALSE, 0, 
								y * sizeof(int), pyLength, 0, NULL, NULL);

#if version == 1
	err |= clEnqueueWriteBuffer(queue, px_d, CL_FALSE, 0, 
								(x+1) * sizeof(struct svm_node), px, 0, NULL, NULL);
#endif
#if version == 2
	err |= clEnqueueWriteBuffer(queue, px_d, CL_FALSE, 0, 
								size * sizeof(dtype), px_u, 0, NULL, NULL);
#endif
#if version == 3
	size_t offset[] = {0,0,0};
	err |= clEnqueueWriteImage(queue, px_d, CL_TRUE, offset, region, sizeof(dtype), 0, 
							   px_u, 0, 0, NULL);
#endif
	clFinish(queue);

	 
	if(err != CL_SUCCESS)
	{
		printf("Data transfer to GPU failed:\n%d\n", (err));
		return -1;
	}

#if bench
	stop_timer(0);
	start_timer(1);
#endif
	// set kernel arguments

	clSetKernelArg(sparsedot_kernel, 0, sizeof(cl_mem), (void *) &px_d);
	clSetKernelArg(sparsedot_kernel, 1, sizeof(cl_mem), (void *) &py_d);
	clSetKernelArg(sparsedot_kernel, 2, sizeof(cl_mem), (void *) &result_d);
	clSetKernelArg(sparsedot_kernel, 3, sizeof(cl_mem), (void *) &pyLength_d);
	clSetKernelArg(sparsedot_kernel, 4, sizeof(cl_ulong), (void *) &x);
	clSetKernelArg(sparsedot_kernel, 5, sizeof(cl_ulong), (void *) &y);
//	clSetKernelArg(sparsedot_kernel, 6, sizeof(cl_float8)*localDim, 0);
#if version == 3
		clSetKernelArg(sparsedot_kernel, 7, sizeof(cl_long), (void *) &region[1]) ;		
		clSetKernelArg(sparsedot_kernel, 8, sizeof(cl_long), (void *) &region[0]) ;		
#endif
	clFlush(queue);

	// start kernel
	err = clEnqueueNDRangeKernel(queue, sparsedot_kernel, 1, 0, &globalDim, &localDim,
					   0, NULL, 0);

	if(err != CL_SUCCESS)
	{
		printf("Kernel launch failed:\n%d\n", (err));
		return -1;
	}

	clFinish(queue);
	
#if bench	
	stop_timer(1);
	start_timer(2);
#endif

	cl_event result_gather;
	 
	// Non-blocking copy of result from device to host
	err = clEnqueueReadBuffer(queue, result_d, CL_FALSE, 0, y * sizeof(dtype), 
						result, 0, NULL, &result_gather);
	
	if(err != CL_SUCCESS)
	{
		printf("Reading result failed:\n%d\n", (err));
		return -1;
	}

	// CPU sync with GPU
	clWaitForEvents(1, &result_gather);

#if bench	
	// stop GPU time measurement
	stop_timer(2);
#endif
	//check result
/*	for(size_t i = 0; i < y; ++i)
	{
		printf("%f ", result[i]);
	}
	printf("\n");
  */  

#if bench
	start_timer(3);
#endif
	bool correct = validate(px, py, result, x, y);
#if bench
	stop_timer(3);
	printf("v%i; x: %lu, y: %lu\n", version, x, y);
	printf("CPU: %f, upcpy: %f DeviceCalc: %f, downcpy: %f\n", 
		   get_secs(3), get_secs(0), get_secs(1), get_secs(2));
#endif
	
	if(correct)
		printf("SUCCESS!\n");
		
	//cleenup

	clReleaseKernel(sparsedot_kernel);
	clReleaseCommandQueue(queue);
	clReleaseEvent(result_gather);
	clReleaseMemObject(px_d);
	clReleaseMemObject(py_d);
	clReleaseMemObject(result_d);
	clReleaseMemObject(pyLength_d);
//	clReleaseDevice(device);

	free(px);
#if version == 2 || version == 3
	free(px_u);
#endif
	free(py);
	free(result);

	return 0;
}
/**
 * \related cl_Mem_Object_t
 *
 * This function map OpenCL Image into Host-accessible memory & returns pointer
 * to mapped memory region
 * @param[in,out] self  pointer to structure, in which 'Map' function pointer
 * is defined to point on this function.
 * @param[in] blocking_map flag of type 'cl_bool' that denotes, should operation
 * be blocking or not.
 * @param [in] map_flags mapping flags, that denotes how memory object should be
 * mapped
 * @param[in] time_mode enumeration, that denotes how time measurement should be
 * performed
 * @param[out] evt_to_generate pointer to OpenCL event that will be generated
 * at the end of operation.
 *
 * @return pointer to Host-accessible region of memory in case of success, NULL
 * pointer otherwise. In that case function sets error value, which is available
 * through cl_Error_t structure, defined by pointer 'self->error'
 *
 * @see cl_err_codes.h for detailed error description.
 * @see 'cl_Error_t' structure for error handling.
 */
static void* Image_Map(
    scow_Mem_Object     *self, 
    cl_bool             blocking_map,
    cl_map_flags        map_flags, 
    TIME_STUDY_MODE     time_mode,
    cl_event            *evt_to_generate, 
    cl_command_queue    explicit_queue)
{
    cl_int ret;

    cl_event mapping_ready, *p_mapping_ready;

    const size_t origin[3] =
    { 0, 0, 0 }, region[3] =
    { self->width, self->height, 1 };

    OCL_CHECK_EXISTENCE(self, NULL);

    if (blocking_map > CL_TRUE)
    {
        self->error->Set_Last_Code(self->error, INVALID_BLOCKING_FLAG);
        return NULL;
    }

    (evt_to_generate != NULL) ?
            (p_mapping_ready = evt_to_generate) : (p_mapping_ready =
                    &mapping_ready);

    // We can't map the object, that is already mapped
    if (self->mapped_to_region != NULL)
    {
        self->error->Set_Last_Code(self->error, BUFFER_IN_USE);
        return VOID_MEM_OBJ_PTR;
    }

    cl_command_queue q =
            (explicit_queue == NULL) ?
                    (self->parent_thread->q_data_dtoh) : (explicit_queue);

    /* Save mapped pointer inside a structure in case if memory object is being
     * destroyed without unmapping it at first.
     */
    self->mapped_to_region = clEnqueueMapImage(q, self->cl_mem_object,
            blocking_map, map_flags, origin, region, &self->row_pitch, NULL, 0,
            NULL, p_mapping_ready, &ret);

    OCL_DIE_ON_ERROR(ret, CL_SUCCESS,
            self->error->Set_Last_Code(self->error, ret), NULL);

    switch (time_mode)
    {
    case MEASURE:
        self->timer->current_time_device = Gather_Time_uS(p_mapping_ready);
        self->timer->total_time_device += self->timer->current_time_device;
        break;

    case DONT_MEASURE:
        break;

    default:
        break;
    }

    if (p_mapping_ready != evt_to_generate){
        clReleaseEvent(*p_mapping_ready);
    }

    return self->mapped_to_region;
}
Example #28
0
void OCLAcceleratorMatrixHYB<ValueType>::Apply(const BaseVector<ValueType> &in, BaseVector<ValueType> *out) const {

  if (this->get_nnz() > 0) {
    
    assert(in.  get_size() >= 0);
    assert(out->get_size() >= 0);
    assert(in.  get_size() == this->get_ncol());
    assert(out->get_size() == this->get_nrow());
    
    
    const OCLAcceleratorVector<ValueType> *cast_in = dynamic_cast<const OCLAcceleratorVector<ValueType>*> (&in) ; 
    OCLAcceleratorVector<ValueType> *cast_out      = dynamic_cast<      OCLAcceleratorVector<ValueType>*> (out) ; 
    
    assert(cast_in != NULL);
    assert(cast_out!= NULL);

    // ELL
    if (this->get_ell_nnz() > 0) {

      int nrow = this->get_nrow();
      int ncol = this->get_ncol();
      int max_row = this->get_ell_max_row();

      cl_int    err;
      cl_event  ocl_event;
      size_t    localWorkSize[1];
      size_t    globalWorkSize[1];

      err  = clSetKernelArg( CL_KERNEL_ELL_SPMV, 0, sizeof(int),    (void *) &nrow );
      err |= clSetKernelArg( CL_KERNEL_ELL_SPMV, 1, sizeof(int),    (void *) &ncol );
      err |= clSetKernelArg( CL_KERNEL_ELL_SPMV, 2, sizeof(int),    (void *) &max_row );
      err |= clSetKernelArg( CL_KERNEL_ELL_SPMV, 3, sizeof(cl_mem), (void *) this->mat_.ELL.col );
      err |= clSetKernelArg( CL_KERNEL_ELL_SPMV, 4, sizeof(cl_mem), (void *) this->mat_.ELL.val );
      err |= clSetKernelArg( CL_KERNEL_ELL_SPMV, 5, sizeof(cl_mem), (void *) cast_in->vec_ );
      err |= clSetKernelArg( CL_KERNEL_ELL_SPMV, 6, sizeof(cl_mem), (void *) cast_out->vec_ );
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      localWorkSize[0]  = this->local_backend_.OCL_max_work_group_size;
      localWorkSize[0] /= 0.5;
      globalWorkSize[0] = ( size_t( nrow / localWorkSize[0] ) + 1 ) * localWorkSize[0];

      err = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                    CL_KERNEL_ELL_SPMV,
                                    1,
                                    NULL,
                                    &globalWorkSize[0],
                                    &localWorkSize[0],
                                    0,
                                    NULL,
                                    &ocl_event);
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      err = clWaitForEvents( 1, &ocl_event );
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      err = clReleaseEvent( ocl_event );
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    }

    // COO
    if (this->get_coo_nnz() > 0) {

      // do not support super small matrices
      assert(this->get_coo_nnz() > OPENCL_WARPSIZE); 
      
      // ----------------------------------------------------------
      // Modified and adopted from CUSP 0.3.1, 
      // http://code.google.com/p/cusp-library/
      // NVIDIA, APACHE LICENSE 2.0
      // ----------------------------------------------------------
      // see __spmv_coo_flat(...)
      // ----------------------------------------------------------
      // CHANGELOG
      // - adopted interface
      // ----------------------------------------------------------  

      const unsigned int BLOCK_SIZE = this->local_backend_.OCL_max_work_group_size;
      //    const unsigned int MAX_BLOCKS = this->local_backend_.GPU_max_blocks;
      
      const unsigned int MAX_BLOCKS = 32; //  cusp::detail::device::arch::max_active_blocks(spmv_coo_flat_kernel<IndexType, ValueType, BLOCK_SIZE, UseCache>, BLOCK_SIZE, (size_t) 0);
      
      const unsigned int WARPS_PER_BLOCK = BLOCK_SIZE / OPENCL_WARPSIZE;
      
      
      const unsigned int num_units  = this->get_coo_nnz() / OPENCL_WARPSIZE; 
      const unsigned int num_warps  = std::min(num_units, WARPS_PER_BLOCK * MAX_BLOCKS);
      const unsigned int num_blocks = (num_warps + (WARPS_PER_BLOCK-1)) / WARPS_PER_BLOCK; // (N + (granularity - 1)) / granularity
      const unsigned int num_iters  = (num_units +  (num_warps-1)) / num_warps;
      
      const unsigned int interval_size = OPENCL_WARPSIZE * num_iters;
      
      const int tail = num_units * OPENCL_WARPSIZE; // do the last few nonzeros separately (fewer than this->local_backend_.GPU_wrap elements)
      
      const unsigned int active_warps = (interval_size == 0) ? 0 : ((tail + (interval_size-1))/interval_size);

      cl_mem *temp_rows = NULL;
      cl_mem *temp_vals = NULL;

      allocate_ocl<int>      (active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_rows);
      allocate_ocl<ValueType>(active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_vals);

      cl_int    err;
      cl_event  ocl_event;
      size_t    localWorkSize[1];
      size_t    globalWorkSize[1];

      ValueType scalar = 1.0;

      // Set arguments for kernel call
      err  = clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 0, sizeof(int),       (void *) &tail );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 1, sizeof(int),       (void *) &interval_size );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 2, sizeof(cl_mem),    (void *) this->mat_.COO.row );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 3, sizeof(cl_mem),    (void *) this->mat_.COO.col );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 4, sizeof(cl_mem),    (void *) this->mat_.COO.val );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 5, sizeof(ValueType), (void *) &scalar );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 6, sizeof(cl_mem),    (void *) cast_in->vec_ );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 7, sizeof(cl_mem),    (void *) cast_out->vec_ );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 8, sizeof(cl_mem),    (void *) temp_rows );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 9, sizeof(cl_mem),    (void *) temp_vals );
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      // Determine local work size for kernel call
      localWorkSize[0]  = BLOCK_SIZE;
      // Determine global work size for kernel call
      globalWorkSize[0] = num_blocks * localWorkSize[0];

      // Start kernel run
      err = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                    CL_KERNEL_COO_SPMV_FLAT,
                                    1,
                                    NULL,
                                    &globalWorkSize[0],
                                    &localWorkSize[0],
                                    0,
                                    NULL,
                                    &ocl_event);
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      // Wait for kernel run to finish
      err = clWaitForEvents( 1, &ocl_event );
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      // Release event when kernel run finished
      err = clReleaseEvent( ocl_event );
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      // Set arguments for kernel call
      err  = clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 0, sizeof(int),    (void *) &active_warps );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 1, sizeof(cl_mem), (void *) temp_rows );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 2, sizeof(cl_mem), (void *) temp_vals );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 3, sizeof(cl_mem), (void *) cast_out->vec_ );
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      // Determine global work size for kernel call
      globalWorkSize[0] = localWorkSize[0];

      // Start kernel run
      err = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                    CL_KERNEL_COO_SPMV_REDUCE_UPDATE,
                                    1,
                                    NULL,
                                    &globalWorkSize[0],
                                    &localWorkSize[0],
                                    0,
                                    NULL,
                                    &ocl_event);
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      // Wait for kernel run to finish
      err = clWaitForEvents( 1, &ocl_event );
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      // Release event when kernel run finished
      err = clReleaseEvent( ocl_event );
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      int nnz = this->get_coo_nnz();

      // Set arguments for kernel call
      err  = clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 0, sizeof(int),       (void *) &nnz );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 1, sizeof(cl_mem),    (void *) this->mat_.COO.row );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 2, sizeof(cl_mem),    (void *) this->mat_.COO.col );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 3, sizeof(cl_mem),    (void *) this->mat_.COO.val );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 4, sizeof(ValueType), (void *) &scalar );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 5, sizeof(cl_mem),    (void *) cast_in->vec_ );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 6, sizeof(cl_mem),    (void *) cast_out->vec_ );
      err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 7, sizeof(int),       (void *) &tail );
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      // Determine local work size for kernel call
      localWorkSize[0]  = 1;
      // Determine global work size for kernel call
      globalWorkSize[0] = 1;

      // Start kernel run
      err = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                    CL_KERNEL_COO_SPMV_SERIAL,
                                    1,
                                    NULL,
                                    &globalWorkSize[0],
                                    &localWorkSize[0],
                                    0,
                                    NULL,
                                    &ocl_event);
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      // Wait for kernel run to finish
      err = clWaitForEvents( 1, &ocl_event );
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

      // Release event when kernel run finished
      err = clReleaseEvent( ocl_event );
      CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); 
      
      free_ocl(&temp_rows);
      free_ocl(&temp_vals);      

    }

  }

}
/**
 * \related cl_Mem_Object_t
 *
 * This function copies content of one OpenCL buffer memory object into another.
 *
 * @param[in,out] self  pointer to structure, in which 'Copy' function pointer
 * is defined to point on this function.
 * @param[out] dest pointer to another Memory Object structure, where the data
 * from 'self' will be copied to.
 * @param[in] blocking_flag flag, that denotes, should operation be blocking or not.
 * @param[in] time_mode enumeration, that denotes how time measurement should be
 * performed.
 * @param[out] evt_to_generate pointer to OpenCL event that will be generated
 * at the end of operation.
 *
 * @return CL_SUCCESS in case of success, error code of type 'ret_code' otherwise.
 *
 * @see cl_err_codes.h for detailed error description.
 * @see 'cl_Error_t' structure for error handling.
 */
static ret_code Buffer_Copy(
    scow_Mem_Object         *self, 
    scow_Mem_Object         *dest,
    cl_bool                 blocking_flag, 
    TIME_STUDY_MODE         time_mode,
    cl_event                *evt_to_generate, 
    cl_command_queue        explicit_queue)
{
    cl_int ret = CL_SUCCESS;

    cl_event copy_ready, *p_copy_ready = (cl_event*) 0x0;

    OCL_CHECK_EXISTENCE(self, INVALID_BUFFER_GIVEN);
    OCL_CHECK_EXISTENCE(dest, INVALID_BUFFER_GIVEN);

    // Can't copy distinct memory objects
    if (self->obj_mem_type != dest->obj_mem_type)
    {
        return DISTINCT_MEM_OBJECTS;
    }

    // Can't copy bigger object into smaller one
    if (self->size > dest->size)
    {
        return INVALID_BUFFER_SIZE;
    }

    // If src & dest are the same, no need to copy at all, just reset timer.
    if (self == dest)
    {
        self->timer->current_time_device = 0;
        return CL_SUCCESS;
    }

    (evt_to_generate == NULL) ? (p_copy_ready = &copy_ready) : (p_copy_ready =
                                        evt_to_generate);

    cl_command_queue q =
            (explicit_queue == NULL) ?
                    (self->parent_thread->q_data_dtod) : (explicit_queue);

    ret = clEnqueueCopyBuffer(q, self->cl_mem_object, dest->cl_mem_object, 0, 0,
            self->size, 0, NULL, p_copy_ready);

    OCL_DIE_ON_ERROR(ret, CL_SUCCESS, NULL, ret);

    switch (time_mode)
    {
    case MEASURE:
        self->timer->current_time_device = Gather_Time_uS(p_copy_ready);
        self->timer->total_time_device += self->timer->current_time_device;
        break;

    default:
        break;
    }

    if (p_copy_ready != evt_to_generate){
        clReleaseEvent(*p_copy_ready);
    }

    return ret;
}
Example #30
0
void sum_gpu(long long *in, long long *out, unsigned int n)
{
	size_t global_size;
	size_t local_size;

	char *kernel_src;

	cl_int err;
	cl_platform_id platform_id;
	cl_device_id device_id;
	cl_uint max_compute_units;
	size_t max_workgroup_size;

	cl_context context;
	cl_command_queue commands;
	cl_program program;
	cl_kernel kernel;
	cl_mem d_array;

	cl_event event;
	cl_ulong start, end;

	/* start OpenCL */
	err = clGetPlatformIDs(1, &platform_id,NULL);
	clErrorHandling("clGetPlatformIDs");

	err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	clErrorHandling("clGetDeviceIDs");

	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	clErrorHandling("clCreateContext");

	commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	clErrorHandling("clCreateCommandQueue");

	/* create kernel */
	kernel_src = file_to_string(KERNEL_SRC);
	program = clCreateProgramWithSource(context, 1, (const char**) &kernel_src, NULL, &err);
	free(kernel_src);
	clErrorHandling("clCreateProgramWithSource");

	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	clErrorHandling("clBuildProgram");

	kernel = clCreateKernel(program, "matrix_mult", &err);
	clErrorHandling("clCreateKernel");

	/* allocate memory and send to gpu */
	d_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long long) * n, NULL, &err);
	clErrorHandling("clCreateBuffer");

	err = clEnqueueWriteBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long) * n, in, 0, NULL, NULL);
	clErrorHandling("clEnqueueWriteBuffer");

	err  = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL);
	err |= clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, NULL);
	clErrorHandling("clGetDeviceInfo");

	/* prepare kernel args */
	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_array);
	err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &n);

	/* execute */
	local_size = n / max_compute_units / 8;
	if (local_size > max_workgroup_size)
		local_size = max_workgroup_size;

	/*
	 *	Usually it would be
	 *	global_size = local_size * max_compute_units;
	 *	but that would only be valid if local_size = n / max_compute_units;
	 *	local_size is n / max_compute_units / 8 because it obtains its hightest performance.
	 */
	for (global_size = local_size; global_size < n; global_size += local_size);

	err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &event);
	clErrorHandling("clEnqueueNDRangeKernel");

	clWaitForEvents(1, &event);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
	fprintf(stderr, "Time for event (ms): %10.5f \n", (end - start) / 1000000.0);

	err = clFinish(commands);
	clErrorHandling("clFinish");

	/* transfer back */
	err = clEnqueueReadBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long), out, 0, NULL, NULL); // a single long long
	clErrorHandling("clEnqueueReadBuffer");

	/* cleanup*/
	clReleaseMemObject(d_array);
	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);
	clReleaseEvent(event);
}