Ejemplo n.º 1
0
int
main(void){
  cl_int err;

  cl_context context;
  cl_device_id did;
  cl_command_queue queue;

  CHECK_CL_ERROR(poclu_get_any_device(&context, &did, &queue));
  TEST_ASSERT( context );
  TEST_ASSERT( did );
  TEST_ASSERT( queue );

  size_t program_size = strlen(program_src);
  char* program_buffer = program_src;

  cl_program program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer,
                                     &program_size, &err);
  //clCreateProgramWithSource for the program with #include failed
  CHECK_OPENCL_ERROR_IN("clCreateProgramWithSource");

  err = clBuildProgram(program, 1, &did, NULL, NULL, NULL);
  TEST_ASSERT(err == CL_BUILD_PROGRAM_FAILURE);

  CHECK_CL_ERROR (clReleaseCommandQueue (queue));
  CHECK_CL_ERROR (clReleaseProgram (program));
  CHECK_CL_ERROR (clReleaseContext (context));

  CHECK_CL_ERROR (clUnloadCompiler ());

  return EXIT_SUCCESS;
}
Ejemplo n.º 2
0
static int get_gpu_device_id(cl_platform_id platform_id,
                             cl_device_id *device_out,
                             cl_bool fallback,
                             cl_int *err)
{
    cl_int _err = CL_SUCCESS;

    assert(device_out != NULL);

    if (!err) err = &_err;

    /* TODO: multi-gpu / multi-device? */
    *err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, device_out,
                          NULL);
    if (*err == CL_DEVICE_NOT_FOUND && !fallback) {
        ERROR("No GPU devices found", 0);
        goto error;
    } else if (*err != CL_DEVICE_NOT_FOUND) {
        CHECK_CL_ERROR(*err);
        return 0;
    }

    *err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
                          device_out, NULL);
    if (*err == CL_DEVICE_NOT_FOUND) {
        ERROR("No devices found", 0);
        goto error;
    }
    CHECK_CL_ERROR(*err);

    return 0;
error:
    return -1;
}
Ejemplo n.º 3
0
static int get_desired_platform(const char *substr,
                                cl_platform_id *platform_id_out,
                                cl_int *err)
{
    cl_int _err = CL_SUCCESS;
    cl_uint i, num_platforms;
    cl_platform_id *platform_ids = NULL;
    char *platform_name = NULL;

    assert(platform_id_out != NULL);

    if (!err) err = &_err;

    *err = clGetPlatformIDs(0, NULL, &num_platforms);
    CHECK_CL_ERROR(*err);

    platform_ids = malloc(sizeof(*platform_ids) * num_platforms);
    CHECK_ALLOCATION(platform_ids);

    *err = clGetPlatformIDs(num_platforms, platform_ids, NULL);
    CHECK_CL_ERROR(*err);

    for (i = 0; i < num_platforms; i++) {
        size_t platform_name_size;

        *err = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 0, NULL,
                                 &platform_name_size);
        CHECK_CL_ERROR(*err);

        platform_name = realloc(platform_name,
                                sizeof(*platform_name) * platform_name_size);
        CHECK_ALLOCATION(platform_name);

        *err = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME,
                                 platform_name_size, platform_name, NULL);
        CHECK_CL_ERROR(*err);

        if (DEBUG)
            printf("Platform %u: \"%s\"\n", i, platform_name);

        if (strstr(platform_name, substr))
            break;
    }

    if (i < num_platforms)
        *platform_id_out = platform_ids[i];
    else
        goto error; /* No platforms found */

    free(platform_ids);
    free(platform_name);
    return 0;
error:
    free(platform_ids);
    free(platform_name);
    return -1;
}
Ejemplo n.º 4
0
int main()
{
  cl_int err;
  cl_event user_evt = NULL;
  int i;

  // An user event can be set to either complete or a negative value, indicating error;
  // additionally, no objects involved in a command that waits on the user event should
  // be released before the event status is set; however, it should be possible to release
  // everything even if the status is set to something which is NOT CL_COMPLETE. So
  // try both CL_COMPLETE and a negative value
  cl_int status[] = {CL_INVALID_EVENT, CL_COMPLETE };

  // We also query for profiling info of the event, which according to the standard
  // should return CL_PROFILING_INFO_NOT_AVAILABLE
  cl_ulong queued, submitted, started, endtime;

  for (i = 0; i < ARRAY_SIZE(status); ++i) {
	  cl_context context;
	  cl_command_queue queue;
	  cl_device_id device;

	  CHECK_CL_ERROR(poclu_get_any_device(&context, &device, &queue));
	  TEST_ASSERT( context );
	  TEST_ASSERT( device );
	  TEST_ASSERT( queue );

	  user_evt = clCreateUserEvent(context, &err);
	  CHECK_OPENCL_ERROR_IN("clCreateUserEvent");
	  TEST_ASSERT( user_evt );

	  CHECK_CL_ERROR(clSetUserEventStatus(user_evt, status[i]));

	  err = clGetEventProfilingInfo(user_evt, CL_PROFILING_COMMAND_QUEUED,
		  sizeof(queued), &queued, NULL);
	  TEST_ASSERT(err == CL_PROFILING_INFO_NOT_AVAILABLE);
	  err = clGetEventProfilingInfo(user_evt, CL_PROFILING_COMMAND_SUBMIT,
		  sizeof(submitted), &submitted, NULL);
	  TEST_ASSERT(err == CL_PROFILING_INFO_NOT_AVAILABLE);
	  err = clGetEventProfilingInfo(user_evt, CL_PROFILING_COMMAND_START,
		  sizeof(started), &started, NULL);
	  TEST_ASSERT(err == CL_PROFILING_INFO_NOT_AVAILABLE);
	  err = clGetEventProfilingInfo(user_evt, CL_PROFILING_COMMAND_END,
		  sizeof(endtime), &endtime, NULL);
	  TEST_ASSERT(err == CL_PROFILING_INFO_NOT_AVAILABLE);

	  CHECK_CL_ERROR(clReleaseEvent(user_evt));
	  CHECK_CL_ERROR(clReleaseCommandQueue(queue));
	  CHECK_CL_ERROR(clReleaseContext(context));
  }

  return EXIT_SUCCESS;

}
Ejemplo n.º 5
0
cl_kernel kernel_from_string(cl_context ctx,
    char const *knl, char const *knl_name, char const *options)
{
  // create an OpenCL program (may have multiple kernels)
  size_t sizes[] = { strlen(knl) };

  cl_int status;
  cl_program program = clCreateProgramWithSource(ctx, 1, &knl, sizes, &status);
  CHECK_CL_ERROR(status, "clCreateProgramWithSource");

  // build it
  status = clBuildProgram(program, 0, NULL, options, NULL, NULL);

  if (status != CL_SUCCESS)
  {
    // build failed, get build log and print it

    cl_device_id dev;
    CALL_CL_GUARDED(clGetProgramInfo, (program, CL_PROGRAM_DEVICES,
          sizeof(dev), &dev, NULL));

    size_t log_size;
    CALL_CL_GUARDED(clGetProgramBuildInfo, (program, dev, CL_PROGRAM_BUILD_LOG,
          0, NULL, &log_size));

    char *log = (char *) malloc(log_size);
    CHECK_SYS_ERROR(!log, "kernel_from_string: allocate log");

    char devname[MAX_NAME_LEN];
    CALL_CL_GUARDED(clGetDeviceInfo, (dev, CL_DEVICE_NAME,
          sizeof(devname), devname, NULL));

    CALL_CL_GUARDED(clGetProgramBuildInfo, (program, dev, CL_PROGRAM_BUILD_LOG,
          log_size, log, NULL));
    fprintf(stderr, "*** build of '%s' on '%s' failed:\n%s\n*** (end of error)\n",
        knl_name, devname, log);
    abort();
  }
  else
    CHECK_CL_ERROR(status, "clBuildProgram");

  // fish the kernel out of the program
  cl_kernel kernel = clCreateKernel(program, knl_name, &status);
  CHECK_CL_ERROR(status, "clCreateKernel");

  CALL_CL_GUARDED(clReleaseProgram, (program));

  return kernel;
}
Ejemplo n.º 6
0
int main(int argc, char *argv[])
{
	cl_int err = CL_SUCCESS;
	cl::Event evt;

	std::vector<cl::Platform> platforms;
	cl::Platform::get(&platforms);
	if (platforms.size() == 0) {
		return false;
	}
	platform_ = platforms[0];

	cl_context_properties properties[] = 
		{ CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
	context_ = cl::Context(CL_DEVICE_TYPE_GPU, properties, NULL, NULL, &err); 
	CHECK_CL_ERROR(err, "cl::Context");

	std::vector<cl::Device> devices = context_.getInfo<CL_CONTEXT_DEVICES>();
	if (devices.size() == 0) {
		return false;
	}
	device_ = devices[0];

	sources_.push_back(std::make_pair(source_str.c_str(), source_str.size()));
	program_ = cl::Program(context_, sources_);
	err = program_.build(devices);
	if (err != CL_SUCCESS) {
		std::string log = program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
		std::cout << "program.build() ERROR: " << log.c_str() << std::endl;
		return false;
	}

	kernel_ = cl::Kernel(program_, "hello", &err); 
	CHECK_CL_ERROR(err, "cl::Kernel");

	buf_ = cl::Buffer(context_, CL_MEM_READ_ONLY, 1024, NULL, &err);

	queue_ = cl::CommandQueue(context_, device_, 0, &err);
	CHECK_CL_ERROR(err, "cl::CommandQueue");

	kernel_.setArg(0, buf_);

	err = queue_.enqueueNDRangeKernel(kernel_, cl::NullRange, cl::NDRange(10, 10), cl::NullRange, NULL, &evt); 
	evt.wait();
	CHECK_CL_ERROR(err, "queue.enqueueNDRangeKernel()");

	return 0;
}
Ejemplo n.º 7
0
static cl_int opencl_plugin_init_voxel_buffer(opencl_plugin plugin,
                                              cl_int num_voxels)
{
    cl_int err;
    cl_mem new_voxel_buffer = NULL;

    assert(plugin != NULL);
    assert(num_voxels >= 0);

    if (num_voxels > plugin->voxel_grid_buffer_capacity) {
        /* Current buffer not big enough, free old buffer first */
        if (plugin->voxel_grid_buffer) {
            clReleaseMemObject(plugin->voxel_grid_buffer);
            plugin->voxel_grid_buffer = NULL;
        }

        plugin->voxel_grid_buffer_capacity = 0;

        /* TODO: Maybe better dynamic resizing (factor = 1.5)? */
        new_voxel_buffer =
            clCreateBuffer(plugin->context, CL_MEM_WRITE_ONLY,
                           (size_t)num_voxels, NULL, &err);
        CHECK_CL_ERROR(err);

        plugin->voxel_grid_buffer_capacity = num_voxels;
        plugin->voxel_grid_buffer = new_voxel_buffer;
        new_voxel_buffer = NULL;
    }

    return 0;
error:
    if (new_voxel_buffer)
        clReleaseMemObject(new_voxel_buffer);
    return -1;
}
void GetPlatformAndDevice(cl_platform_id & target_platform, cl_device_id & target_device)
{
    cl_platform_id* platforms;
    cl_device_id* devices;
    cl_uint count;
    cl_int error;
    size_t length;

    char *queryString;

    /* Find platform */
    error = clGetPlatformIDs(0, NULL, &count);
    CHECK_CL_ERROR(error);

    platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * count);
    clGetPlatformIDs(count, platforms, NULL);

    if (g_opencl_ctrl.platform_id >= count)
        {fprintf(stderr, "Error: Cannot find selected platform\n"); exit(1);}
    target_platform = platforms[g_opencl_ctrl.platform_id];

    /* Find device */
    error = clGetDeviceIDs(target_platform, CL_DEVICE_TYPE_ALL, 0, NULL, &count);
    CHECK_CL_ERROR(error);

    devices = (cl_device_id *)malloc(sizeof(cl_device_id) * count);
    clGetDeviceIDs(target_platform, CL_DEVICE_TYPE_ALL, count, devices, NULL);

    if (g_opencl_ctrl.device_id >= count)
        {fprintf(stderr, "Error: Cannot find selected device\n"); exit(1);}
    target_device = devices[g_opencl_ctrl.device_id];

    /* Get device name */
    error = clGetDeviceInfo(target_device, CL_DEVICE_NAME, 0, NULL, &length);
    CHECK_CL_ERROR(error);

    queryString = (char *)malloc(sizeof(char) * length);
    clGetDeviceInfo(target_device, CL_DEVICE_NAME, length, queryString, NULL);
    fprintf(stderr, "Device selected: '%s'\n", queryString);

    /* Free the space */
    free(platforms);
    free(devices);
    free(queryString);
}
Ejemplo n.º 9
0
//TODO remove this at some point (deprecated)
cl_kernel kernel_from_string(cl_context ctx, 
    char const *knl, char const *knl_name, char const *options)
{
  size_t sizes[] = { strlen(knl) };

  cl_int status;
  cl_program program = clCreateProgramWithSource(ctx, 1, &knl, sizes, &status);
  CHECK_CL_ERROR(status, "clCreateProgramWithSource");

  status = clBuildProgram(program, 0, NULL, options, NULL, NULL);

  if (status != CL_SUCCESS)
  {
    // build failed, get build log.

    cl_device_id dev;
    CALL_CL_GUARDED(clGetProgramInfo, (program, CL_PROGRAM_DEVICES,
          sizeof(dev), &dev, NULL));

    size_t log_size;
    CALL_CL_GUARDED(clGetProgramBuildInfo, (program, dev, CL_PROGRAM_BUILD_LOG,
          0, NULL, &log_size));

    char *log = malloc(log_size);
    CHECK_SYS_ERROR(!log, "kernel_from_string: allocate log");

    char devname[100];
    CALL_CL_GUARDED(clGetDeviceInfo, (dev, CL_DEVICE_NAME,
          sizeof(devname), devname, NULL));

    CALL_CL_GUARDED(clGetProgramBuildInfo, (program, dev, CL_PROGRAM_BUILD_LOG,
          log_size, log, NULL));
    THError("*** build of '%s' on '%s' failed:\n%s\n*** (end of error)\n",
            knl_name, devname, log);
  }
  else
    CHECK_CL_ERROR(status, "clBuildProgram");

  cl_kernel kernel = clCreateKernel(program, knl_name, &status);
  CHECK_CL_ERROR(status, "clCreateKernel");

  CALL_CL_GUARDED(clReleaseProgram, (program));

  return kernel;
}
Ejemplo n.º 10
0
void ProgramLauncher::launch(const Program *program, std::vector<Program::Parameter> &parameters, size_t dim, size_t *global_size, size_t *local_size, bool blocking) {
    if(!m_is_initialized) return;

    cl_kernel kernel    = program->getEntryPoint();

    for (size_t i = 0; i < parameters.size(); i++) {
        CHECK_CL_ERROR(clSetKernelArg (kernel, (cl_uint)i, parameters[i].size, parameters[i].value));
    }

    cl_event event;
    CHECK_CL_ERROR(clEnqueueNDRangeKernel (m_compute_context->queue(), kernel, (cl_uint)dim, NULL, global_size, local_size, 0, NULL, &event));
    //logger->log(Logger::DEBUG, "Running kernel %s with dimensions %d @ [%d,%d,%d]",
    //        program->getName().c_str(), dim, global_size[0],global_size[1],global_size[2]);

    if(blocking){
        CHECK_CL_ERROR(clFinish(m_compute_context->queue()));
    }

    PerformanceAnalyser::analyzeProgramEvent(&m_compute_context->queue(), event, program->getName());
    clReleaseEvent(event);
}
Ejemplo n.º 11
0
static int create_context(cl_platform_id platform,
                          cl_device_id device,
                          cl_context *context_out,
                          cl_int *err)
{
    cl_int _err = CL_SUCCESS;
    cl_context_properties context_properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};

    assert(context_out != NULL);

    if (!err) err = &_err;

    *context_out = clCreateContext(context_properties, 1, &device, NULL, NULL,
                                   err);
    CHECK_CL_ERROR(*err);

    return 0;
error:
    return -1;
}
Ejemplo n.º 12
0
static int enqueue_zero_buffer(cl_command_queue queue,
                               cl_mem buffer,
                               size_t size,
                               cl_uint num_events_in_wait_list,
                               const cl_event *event_wait_list,
                               cl_event *event,
                               cl_int *err)
{
    cl_int _err;
    cl_uchar c = 0;

    if (!err) err = &_err;

    *err = clEnqueueFillBuffer(queue, (cl_mem)buffer, &c, sizeof(c), 0,
                               size, num_events_in_wait_list, event_wait_list,
                               event);
    CHECK_CL_ERROR(*err);

    return 0;
error:
    return -1;
}
Ejemplo n.º 13
0
int main(int argc, char **argv)
{
  cl_int err;
  const char *krn_src;
  cl_program program;
  cl_context ctx;
  cl_command_queue queue;
  cl_device_id did;
  cl_kernel kernel;

  CHECK_CL_ERROR(poclu_get_any_device(&ctx, &did, &queue));
  TEST_ASSERT(ctx);
  TEST_ASSERT(did);
  TEST_ASSERT(queue);

  krn_src = poclu_read_file(SRCDIR "/tests/runtime/test_clCreateKernelsInProgram.cl");
  TEST_ASSERT(krn_src);

  program = clCreateProgramWithSource(ctx, 1, &krn_src, NULL, &err);
  CHECK_OPENCL_ERROR_IN("clCreateProgramWithSource");

  CHECK_CL_ERROR(clBuildProgram(program, 0, NULL, NULL, NULL, NULL));

  kernel = clCreateKernel(program, NULL, &err);
  TEST_ASSERT(err == CL_INVALID_VALUE);
  TEST_ASSERT(kernel == NULL);

  kernel = clCreateKernel(program, "nonexistent_kernel", &err);
  TEST_ASSERT(err == CL_INVALID_KERNEL_NAME);
  TEST_ASSERT(kernel == NULL);

  CHECK_CL_ERROR (clReleaseCommandQueue (queue));
  CHECK_CL_ERROR (clReleaseProgram (program));
  CHECK_CL_ERROR (clReleaseContext (ctx));
  CHECK_CL_ERROR (clUnloadCompiler ());

  free ((void *)krn_src);

  printf("OK\n");

  return 0;
}
Ejemplo n.º 14
0
int main(int argc, char **argv)
{
  /* test name */
  char name[] = "test_image_query_funcs";
  size_t global_work_size[1] = { 1 }, local_work_size[1]= { 1 };
  size_t srcdir_length, name_length, filename_size;
  char *filename = NULL;
  char *source = NULL;
  cl_device_id devices[1];
  cl_context context = NULL;
  cl_command_queue queue = NULL;
  cl_program program = NULL;
  cl_kernel kernel = NULL;
  cl_int err;

  /* image parameters */
  cl_uchar4 *imageData;
  cl_image_format image_format;
  cl_image_desc image2_desc, image3_desc;

  printf("Running test %s...\n", name);

  memset(&image2_desc, 0, sizeof(cl_image_desc));
  image2_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  image2_desc.image_width = 2;
  image2_desc.image_height = 4;

  memset(&image3_desc, 0, sizeof(cl_image_desc));
  image3_desc.image_type = CL_MEM_OBJECT_IMAGE3D;
  image3_desc.image_width = 2;
  image3_desc.image_height = 4;
  image3_desc.image_depth = 8;

  image_format.image_channel_order = CL_RGBA;
  image_format.image_channel_data_type = CL_UNSIGNED_INT8;
  imageData = (cl_uchar4*)malloc (4 * 4 * sizeof(cl_uchar4));

  TEST_ASSERT (imageData != NULL && "out of host memory\n");
  memset (imageData, 1, 4*4*sizeof(cl_uchar4));

  /* determine file name of kernel source to load */
  srcdir_length = strlen(SRCDIR);
  name_length = strlen(name);
  filename_size = srcdir_length + name_length + 16;
  filename = (char *)malloc(filename_size + 1);
  TEST_ASSERT (filename != NULL && "out of host memory\n");

  snprintf(filename, filename_size, "%s/%s.cl", SRCDIR, name);

  /* read source code */
  source = poclu_read_file (filename);
  TEST_ASSERT (source != NULL && "Kernel .cl not found.");

  /* setup an OpenCL context and command queue using default device */
  context = poclu_create_any_context();
  TEST_ASSERT (context != NULL && "clCreateContextFromType call failed\n");

  cl_sampler external_sampler = clCreateSampler (
      context, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &err);
  CHECK_OPENCL_ERROR_IN ("clCreateSampler");

  CHECK_CL_ERROR (clGetContextInfo (context, CL_CONTEXT_DEVICES,
                                    sizeof (cl_device_id), devices, NULL));

  queue = clCreateCommandQueue (context, devices[0], 0, &err);
  CHECK_OPENCL_ERROR_IN ("clCreateCommandQueue");

  /* Create image */
  cl_mem image2
      = clCreateImage (context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                       &image_format, &image2_desc, imageData, &err);
  CHECK_OPENCL_ERROR_IN ("clCreateImage image2");

  cl_mem image3
      = clCreateImage (context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                       &image_format, &image3_desc, imageData, &err);
  CHECK_OPENCL_ERROR_IN ("clCreateImage image3");

  unsigned color[4] = { 2, 9, 11, 7 };
  size_t orig[3] = { 0, 0, 0 };
  size_t reg[3] = { 2, 4, 1 };
  err = clEnqueueFillImage (queue, image2, color, orig, reg, 0, NULL, NULL);
  CHECK_OPENCL_ERROR_IN ("clCreateImage image3");

  /* create and build program */
  program = clCreateProgramWithSource (context, 1, (const char **)&source,
                                       NULL, &err);
  CHECK_OPENCL_ERROR_IN ("clCreateProgramWithSource");

  err = clBuildProgram (program, 0, NULL, NULL, NULL, NULL);
  CHECK_OPENCL_ERROR_IN ("clBuildProgram");

  /* execute the kernel with give name */
  kernel = clCreateKernel (program, name, NULL);
  CHECK_OPENCL_ERROR_IN ("clCreateKernel");

  err = clSetKernelArg (kernel, 0, sizeof (cl_mem), &image2);
  CHECK_OPENCL_ERROR_IN ("clSetKernelArg 0");

  err = clSetKernelArg (kernel, 1, sizeof (cl_mem), &image3);
  CHECK_OPENCL_ERROR_IN ("clSetKernelArg 1");

  err = clSetKernelArg (kernel, 2, sizeof (cl_sampler), &external_sampler);
  CHECK_OPENCL_ERROR_IN ("clSetKernelArg 2");

  err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, global_work_size,
                                local_work_size, 0, NULL, NULL);
  CHECK_OPENCL_ERROR_IN ("clEnqueueNDRangeKernel");

  err = clFinish (queue);
  CHECK_OPENCL_ERROR_IN ("clFinish");

  clReleaseMemObject (image2);
  clReleaseMemObject (image3);
  clReleaseKernel (kernel);
  clReleaseProgram (program);
  clReleaseCommandQueue (queue);
  clReleaseSampler (external_sampler);
  clUnloadCompiler ();
  clReleaseContext (context);
  free (source);
  free (filename);
  free (imageData);

  printf("OK\n");
  return 0;
}
Ejemplo n.º 15
0
int test_context(cl_context ctx, const char *prog_src, int mul,
  int ndevs, cl_device_id *devs) {
  cl_int err;
  cl_command_queue queue[ndevs];
  cl_program prog;
  cl_kernel krn;
  cl_mem buf;
  cl_event evt[ndevs];
  cl_int i;

  prog = clCreateProgramWithSource(ctx, 1, &prog_src, NULL, &err);
  CHECK_OPENCL_ERROR_IN("create program");

  CHECK_CL_ERROR(clBuildProgram(prog, 0, NULL, NULL, NULL, NULL));

  krn = clCreateKernel(prog, "setidx", &err);
  CHECK_OPENCL_ERROR_IN("create kernel");

  buf = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE |
    CL_MEM_HOST_READ_ONLY, ndevs*sizeof(cl_int), NULL, &err);
  CHECK_OPENCL_ERROR_IN("create buffer");

  CHECK_CL_ERROR(clSetKernelArg(krn, 0, sizeof(cl_mem), &buf));

  /* create one queue per device, and submit task, waiting for all
   * previous */
  for (i = 0; i < ndevs; ++i) {
    queue[i] = clCreateCommandQueue(ctx, devs[i], 0, &err);
    CHECK_OPENCL_ERROR_IN("create queue");
    err = clSetKernelArg(krn, 1, sizeof(i), &i);
    CHECK_OPENCL_ERROR_IN("set kernel arg 1");
    // no wait list for first (root) device
    err = clEnqueueTask(queue[i], krn, i, i ? evt : NULL, evt + i);
    CHECK_OPENCL_ERROR_IN("submit task");
  }

  /* enqueue map on last */
  cl_int *buf_host = clEnqueueMapBuffer(queue[ndevs - 1], buf, CL_TRUE,
    CL_MAP_READ, 0, ndevs*sizeof(cl_int), ndevs, evt, NULL, &err);
  CHECK_OPENCL_ERROR_IN("map buffer");

  int mismatch = 0;
  for (i = 0; i < ndevs; ++i) {
    mismatch += !!(buf_host[i] != i*mul);
  }
  TEST_ASSERT(mismatch == 0);

  /* enqueue unmap on first */
  CHECK_CL_ERROR(clEnqueueUnmapMemObject(queue[0], buf, buf_host, 0, NULL, NULL));

  for (i = 0 ; i < ndevs; ++i) {
    err = clFinish(queue[i]);
    err |= clReleaseCommandQueue(queue[i]);
    err |= clReleaseEvent(evt[i]);
  }

  err |= clReleaseKernel(krn);
  err |= clReleaseMemObject(buf);
  err |= clReleaseProgram(prog);
  err |= clReleaseContext(ctx);

  CHECK_OPENCL_ERROR_IN("cleanup");

  return CL_SUCCESS;

}
Ejemplo n.º 16
0
int main(int argc, char **argv)
{
  cl_context ctx;
  cl_command_queue q;
  // root device, all devices
#define NUMDEVS 6
  cl_device_id rootdev, alldevs[NUMDEVS];
  // pointers to the sub devices of the partitions EQUALLY and BY_COUNTS
  // respectively
  cl_device_id
    *eqdev = alldevs + 1,
    *countdev = alldevs + 4;
  cl_uint max_cus, max_subs, split;
  cl_uint i, j;

  cl_int err = poclu_get_any_device(&ctx, &rootdev, &q);
  CHECK_OPENCL_ERROR_IN("poclu_get_any_device");
  TEST_ASSERT( ctx );
  TEST_ASSERT( rootdev );
  TEST_ASSERT( q );

  alldevs[0] = rootdev;

  err = clGetDeviceInfo(rootdev, CL_DEVICE_MAX_COMPUTE_UNITS,
    sizeof(max_cus), &max_cus, NULL);
  CHECK_OPENCL_ERROR_IN("CL_DEVICE_MAX_COMPUTE_UNITS");
  if (max_cus < 2)
    {
      printf("This test requires a cl device with at least 2 compute units"
             " (a dual-core or better CPU)\n");
      return 1;
    }

  err = clGetDeviceInfo(rootdev, CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
    sizeof(max_subs), &max_subs, NULL);
  CHECK_OPENCL_ERROR_IN("CL_DEVICE_PARTITION_MAX_SUB_DEVICES");

  // test fails without possible sub-devices, e.g. with basic pocl device
  TEST_ASSERT(max_subs > 1);

  cl_device_partition_property *dev_pt;
  size_t dev_pt_size;

  err = clGetDeviceInfo(rootdev, CL_DEVICE_PARTITION_PROPERTIES,
    0, NULL, &dev_pt_size);
  CHECK_OPENCL_ERROR_IN("CL_DEVICE_PARTITION_PROPERTIES size");

  dev_pt = malloc(dev_pt_size);
  TEST_ASSERT(dev_pt);
  err = clGetDeviceInfo(rootdev, CL_DEVICE_PARTITION_PROPERTIES,
    dev_pt_size, dev_pt, NULL);
  CHECK_OPENCL_ERROR_IN("CL_DEVICE_PARTITION_PROPERTIES");

  j = dev_pt_size / sizeof (*dev_pt); // number of partition types

  // check that partition types EQUALLY and BY_COUNTS are supported
  int found = 0;
  for (i = 0; i < j; ++i)
    {
      if (dev_pt[i] == CL_DEVICE_PARTITION_EQUALLY
          || dev_pt[i] == CL_DEVICE_PARTITION_BY_COUNTS)
        ++found;
    }

  TEST_ASSERT(found == 2);

  // here we will store the partition types returned by the subdevices
  cl_device_partition_property *ptype = NULL;
  size_t ptype_size;
  cl_uint numdevs = 0;

  cl_device_id parent;
  cl_uint sub_cus;

  /* CL_DEVICE_PARTITION_EQUALLY */

  printf("Max CUs: %u\n", max_cus);

  /* if the device has 3 CUs, 3 subdevices will be created, otherwise 2. */
  if (max_cus == 3)
    split = 3;
  else
    split = 2;

  const cl_device_partition_property equal_splitter[] = {
    CL_DEVICE_PARTITION_EQUALLY, max_cus/split, 0 };

  err = clCreateSubDevices(rootdev, equal_splitter, 0, NULL, &numdevs);
  CHECK_OPENCL_ERROR_IN("count sub devices");
  TEST_ASSERT(numdevs == split);

  err = clCreateSubDevices(rootdev, equal_splitter, split, eqdev, NULL);
  CHECK_OPENCL_ERROR_IN("partition equally");
  if (split == 2)
     eqdev[2] = NULL;

  cl_uint refc;
  err = clGetDeviceInfo (eqdev[0], CL_DEVICE_REFERENCE_COUNT, sizeof (refc),
                         &refc, NULL);
  CHECK_OPENCL_ERROR_IN ("get refcount");
  TEST_ASSERT (refc == 1);

  /* First, check that the root device is untouched */

  err = clGetDeviceInfo(rootdev, CL_DEVICE_MAX_COMPUTE_UNITS,
    sizeof(sub_cus), &sub_cus, NULL);
  CHECK_OPENCL_ERROR_IN("parenty CU");
  TEST_ASSERT(sub_cus == max_cus);

  err = clGetDeviceInfo(rootdev, CL_DEVICE_PARENT_DEVICE,
    sizeof(parent), &parent, NULL);
  CHECK_OPENCL_ERROR_IN("root parent device");
  TEST_ASSERT(parent == NULL);

  /* partition type may either be NULL or contain a 0 entry */
  err = clGetDeviceInfo(rootdev, CL_DEVICE_PARTITION_TYPE,
    0, NULL, &ptype_size);
  CHECK_OPENCL_ERROR_IN("root partition type");

  if (ptype_size != 0) {
    /* abuse dev_pt which should be large enough */
    TEST_ASSERT(ptype_size == sizeof(cl_device_partition_property));
    TEST_ASSERT(ptype_size <= dev_pt_size);
    err = clGetDeviceInfo(rootdev, CL_DEVICE_PARTITION_TYPE,
      ptype_size, dev_pt, NULL);
    CHECK_OPENCL_ERROR_IN("root partition type #2");
    TEST_ASSERT(dev_pt[0] == 0);
  }

  /* now test the subdevices */
  for (i = 0; i < split; ++i) {
    err = clGetDeviceInfo(eqdev[i], CL_DEVICE_MAX_COMPUTE_UNITS,
      sizeof(sub_cus), &sub_cus, NULL);
    CHECK_OPENCL_ERROR_IN("sub CU");
    TEST_ASSERT(sub_cus == max_cus/split);

    err = clGetDeviceInfo(eqdev[i], CL_DEVICE_PARENT_DEVICE,
      sizeof(parent), &parent, NULL);
    CHECK_OPENCL_ERROR_IN("sub parent device");
    TEST_ASSERT(parent == rootdev);

    err = clGetDeviceInfo(eqdev[i], CL_DEVICE_PARTITION_TYPE,
      0, NULL, &ptype_size);
    CHECK_OPENCL_ERROR_IN("sub partition type");
    TEST_ASSERT(ptype_size == sizeof(equal_splitter));

    ptype = malloc(ptype_size);
    TEST_ASSERT(ptype);
    err = clGetDeviceInfo(eqdev[i], CL_DEVICE_PARTITION_TYPE,
      ptype_size, ptype, NULL);
    CHECK_OPENCL_ERROR_IN("sub partition type #2");

    TEST_ASSERT(memcmp(ptype, equal_splitter, ptype_size) == 0);

    /* free the partition type */
    free(ptype) ; ptype = NULL;
  }

  /* CL_DEVICE_PARTITION_BY_COUNTS */

  /* Note that the platform will only read this to the first 0,
   * which is actually CL_DEVICE_PARTITION_BY_COUNTS_LIST_END;
   * the test is structured with an additional final 0 intentionally,
   * to follow the Khoronos doc example
   */
  const cl_device_partition_property count_splitter[] = {
    CL_DEVICE_PARTITION_BY_COUNTS, 1, max_cus - 1,
    CL_DEVICE_PARTITION_BY_COUNTS_LIST_END, 0 };

  err = clCreateSubDevices(rootdev, count_splitter, 0, NULL, &numdevs);
  CHECK_OPENCL_ERROR_IN("count sub devices");
  TEST_ASSERT(numdevs == 2);

  err = clCreateSubDevices(rootdev, count_splitter, 2, countdev, NULL);
  CHECK_OPENCL_ERROR_IN("partition by counts");

  /* First, check that the root device is untouched */

  err = clGetDeviceInfo(rootdev, CL_DEVICE_MAX_COMPUTE_UNITS,
    sizeof(sub_cus), &sub_cus, NULL);
  CHECK_OPENCL_ERROR_IN("parenty CU");
  TEST_ASSERT(sub_cus == max_cus);

  err = clGetDeviceInfo(rootdev, CL_DEVICE_PARENT_DEVICE,
    sizeof(parent), &parent, NULL);
  CHECK_OPENCL_ERROR_IN("root parent device");
  TEST_ASSERT(parent == NULL);

  /* partition type may either be NULL or contain a 0 entry */
  err = clGetDeviceInfo(rootdev, CL_DEVICE_PARTITION_TYPE,
    0, NULL, &ptype_size);
  CHECK_OPENCL_ERROR_IN("root partition type");

  if (ptype_size != 0) {
    /* abuse dev_pt which should be large enough */
    TEST_ASSERT(ptype_size == sizeof(cl_device_partition_property));
    TEST_ASSERT(ptype_size <= dev_pt_size);
    err = clGetDeviceInfo(rootdev, CL_DEVICE_PARTITION_TYPE,
      ptype_size, dev_pt, NULL);
    CHECK_OPENCL_ERROR_IN("root partition type #2");
    TEST_ASSERT(dev_pt[0] == 0);
  }

  // devices might be returned in different order than the counts
  // in the count_splitter

  int found_cus[2] = {0, 0};

  /* now test the subdevices */
  for (i = 0; i < 2; ++i) {
    err = clGetDeviceInfo(countdev[i], CL_DEVICE_MAX_COMPUTE_UNITS,
      sizeof(sub_cus), &sub_cus, NULL);
    CHECK_OPENCL_ERROR_IN("sub CU");
    if (sub_cus == count_splitter[1])
        found_cus[0] += 1;
    else if (sub_cus == count_splitter[2])
        found_cus[1] += 1;

    err = clGetDeviceInfo(countdev[i], CL_DEVICE_PARENT_DEVICE,
      sizeof(parent), &parent, NULL);
    CHECK_OPENCL_ERROR_IN("sub parent device");
    TEST_ASSERT(parent == rootdev);

    /* The partition type returned is up to the first 0,
     * which happens to be the CL_DEVICE_PARTITION_BY_COUNTS_LIST_END,
     * not the final terminating 0 in count_splitter, so it has one less
     * element. It should be otherwise equal */
    err = clGetDeviceInfo(countdev[i], CL_DEVICE_PARTITION_TYPE,
      0, NULL, &ptype_size);
    CHECK_OPENCL_ERROR_IN("sub partition type");
    TEST_ASSERT(ptype_size == sizeof(count_splitter) - sizeof(*count_splitter));

    ptype = malloc(ptype_size);
    TEST_ASSERT(ptype);
    err = clGetDeviceInfo(countdev[i], CL_DEVICE_PARTITION_TYPE,
      ptype_size, ptype, NULL);
    CHECK_OPENCL_ERROR_IN("sub partition type #2");

    TEST_ASSERT(memcmp(ptype, count_splitter, ptype_size) == 0);

    /* free the partition type */
    free(ptype) ; ptype = NULL;
  }

  /* the previous loop finds 1+1 subdevices only on >dual core systems;
   * on dual cores, the count_splitter is [1, 1] and the above
   * "(sub_cus == count_splitter[x])" results in 2+0 subdevices found */
  if (max_cus > 2)
    TEST_ASSERT(found_cus[0] == 1 && found_cus[1] == 1);
  else
    TEST_ASSERT((found_cus[0] + found_cus[1]) == 2);

  /* So far, so good. Let's now try and use these devices,
   * by building a program for all of them and launching kernels on them.
   *
   * Note that there's a discrepancy in behavior between implementations:
   * some assume you can treat sub-devices as their parent device, and thus
   * e.g. using them through any context which includes their parent devices,
   * other fail miserably if you try this.
   *
   * For the time being we will test the stricter behavior, where
   * sub-devices should be added manually to a context.
   */

  err = clReleaseCommandQueue(q);
  CHECK_OPENCL_ERROR_IN("clReleaseCommandQueue");
  err = clReleaseContext(ctx);
  CHECK_OPENCL_ERROR_IN("clReleaseContext");

  /* if we split into 2 equal parts, third pointer is NULL. Let's copy the
   * previous device to it */
  if (split == 2)
    eqdev[2] = eqdev[1];

  ctx = clCreateContext(NULL, NUMDEVS, alldevs, NULL, NULL, &err);
  CHECK_OPENCL_ERROR_IN("clCreateContext");
  TEST_ASSERT( test_context(ctx, prog_src_all, 1, NUMDEVS, alldevs) == CL_SUCCESS );

  ctx = clCreateContext(NULL, NUMDEVS - 1, alldevs + 1, NULL, NULL, &err);
  CHECK_OPENCL_ERROR_IN("clCreateContext");
  TEST_ASSERT( test_context(ctx, prog_src_two, -1, NUMDEVS - 1, alldevs + 1)
    == CL_SUCCESS );

  /* Don't release the same device twice. clReleaseDevice(NULL) should return
   * an error but not crash. */
  if (split == 2)
    eqdev[2] = NULL;

  for (i = 0; i < NUMDEVS; i++)
    clReleaseDevice (alldevs[i]);

  CHECK_CL_ERROR (clUnloadCompiler ());
  free (dev_pt);

  printf ("OK\n");

  return 0;
}
Ejemplo n.º 17
0
int
main(void)
{
  cl_int err;
  cl_platform_id platforms[MAX_PLATFORMS];
  cl_uint nplatforms;
  cl_device_id devices[MAX_DEVICES];
  cl_uint ndevices;
  cl_uint i, j;
  size_t el, row, col;

  CHECK_CL_ERROR(clGetPlatformIDs(MAX_PLATFORMS, platforms, &nplatforms));

  for (i = 0; i < nplatforms; i++)
  {
    CHECK_CL_ERROR(clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, MAX_DEVICES,
      devices, &ndevices));

    /* Only test the devices we actually have room for */
    if (ndevices > MAX_DEVICES)
      ndevices = MAX_DEVICES;

    for (j = 0; j < ndevices; j++)
    {
      /* skip devices that do not support images */
      cl_bool has_img;
      CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE_SUPPORT, sizeof(has_img), &has_img, NULL));
      if (!has_img)
        continue;

      cl_context context = clCreateContext(NULL, 1, &devices[j], NULL, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clCreateContext");
      cl_command_queue queue = clCreateCommandQueue(context, devices[j], 0, &err);
      CHECK_OPENCL_ERROR_IN("clCreateCommandQueue");

      cl_ulong alloc;
      size_t max_height;
      size_t max_width;
#define MAXALLOC (1024U*1024U)

      CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE,
          sizeof(alloc), &alloc, NULL));
      CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE2D_MAX_WIDTH,
          sizeof(max_width), &max_width, NULL));
      CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE2D_MAX_HEIGHT,
          sizeof(max_height), &max_height, NULL));


      while (alloc > MAXALLOC)
        alloc /= 2;

      // fit at least one max_width inside the alloc (shrink max_width for this)
      while (max_width*pixel_size > alloc)
        max_width /= 2;

      // round number of elements to next multiple of max_width elements
      const size_t nels = (alloc/pixel_size/max_width)*max_width;
      const size_t buf_size = nels*pixel_size;

      cl_image_desc img_desc;
      memset(&img_desc, 0, sizeof(img_desc));
      img_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
      img_desc.image_width = max_width;
      img_desc.image_height = nels/max_width;
      img_desc.image_depth = 1;

      cl_ushort null_pixel[4] = {0, 0, 0, 0};
      cl_ushort *host_buf = malloc(buf_size);
      TEST_ASSERT(host_buf);

      for (el = 0; el < nels; el+=4) {
        host_buf[el] = el & CHANNEL_MAX;
        host_buf[el+1] = (CHANNEL_MAX - el) & CHANNEL_MAX;
        host_buf[el+2] = (CHANNEL_MAX/((el & 1) + 1) - el) & CHANNEL_MAX;
        host_buf[el+3] = (CHANNEL_MAX - el/((el & 1) + 1)) & CHANNEL_MAX;
      }

      cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clCreateBuffer");
      cl_mem img = clCreateImage(context, CL_MEM_READ_WRITE, &img_format, &img_desc, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clCreateImage");

      CHECK_CL_ERROR(clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, buf_size, host_buf, 0, NULL, NULL));

      const size_t offset = 0;
      const size_t origin[] = {0, 0, 0};
      const size_t region[] = {img_desc.image_width, img_desc.image_height, 1};

      CHECK_CL_ERROR(clEnqueueCopyBufferToImage(queue, buf, img, offset, origin, region, 0, NULL, NULL));

      size_t row_pitch, slice_pitch;
      cl_ushort *img_map = clEnqueueMapImage(queue, img, CL_TRUE, CL_MAP_READ, origin, region,
        &row_pitch, &slice_pitch, 0, NULL, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clEnqueueMapImage");

      CHECK_CL_ERROR(clFinish(queue));

      for (row = 0; row < img_desc.image_height; ++row) {
        for (col = 0; col < img_desc.image_width; ++col) {
          cl_ushort *img_pixel = (cl_ushort*)((char*)img_map + row*row_pitch) + col*4;
          cl_ushort *buf_pixel = host_buf + (row*img_desc.image_width + col)*4;

          if (memcmp(img_pixel, buf_pixel, pixel_size) != 0)
            printf("%zu %zu %zu : %x %x %x %x | %x %x %x %x\n",
              row, col, (size_t)(buf_pixel - host_buf),
              buf_pixel[0],
              buf_pixel[1],
              buf_pixel[2],
              buf_pixel[3],
              img_pixel[0],
              img_pixel[1],
              img_pixel[2],
              img_pixel[3]);

          TEST_ASSERT(memcmp(img_pixel, buf_pixel, pixel_size) == 0);

        }
      }

      CHECK_CL_ERROR(clEnqueueUnmapMemObject(queue, img, img_map, 0, NULL, NULL));

      /* Clear the buffer, and ensure it has been cleared */
      CHECK_CL_ERROR(clEnqueueFillBuffer(queue, buf, null_pixel, sizeof(null_pixel), 0, buf_size, 0, NULL, NULL));
      cl_ushort *buf_map = clEnqueueMapBuffer(queue, buf, CL_TRUE, CL_MAP_READ, 0, buf_size, 0, NULL, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clEnqueueMapBuffer");

      CHECK_CL_ERROR(clFinish(queue));

      for (el = 0; el < nels; ++el) {
#if 0 // debug
        if (buf_map[el] != 0) {
          printf("%zu/%zu => %u\n", el, nels, buf_map[el]);
        }
#endif
        TEST_ASSERT(buf_map[el] == 0);
      }

      CHECK_CL_ERROR(clEnqueueUnmapMemObject(queue, buf, buf_map, 0, NULL, NULL));

      /* Copy data from image to buffer, and check that it's again equal to the original buffer */
      CHECK_CL_ERROR(clEnqueueCopyImageToBuffer(queue, img, buf, origin, region, offset, 0, NULL, NULL));
      buf_map = clEnqueueMapBuffer(queue, buf, CL_TRUE, CL_MAP_READ, 0, buf_size, 0, NULL, NULL, &err);
      CHECK_CL_ERROR(clFinish(queue));

      TEST_ASSERT(memcmp(buf_map, host_buf, buf_size) == 0);

      CHECK_CL_ERROR (
          clEnqueueUnmapMemObject (queue, buf, buf_map, 0, NULL, NULL));
      CHECK_CL_ERROR (clFinish (queue));

      free(host_buf);
      CHECK_CL_ERROR (clReleaseMemObject (img));
      CHECK_CL_ERROR (clReleaseMemObject (buf));
      CHECK_CL_ERROR (clReleaseCommandQueue (queue));
      CHECK_CL_ERROR (clReleaseContext (context));
    }
  }
  return EXIT_SUCCESS;
}
Ejemplo n.º 18
0
void create_context_on(const char *plat_name, const char*dev_name, cl_uint idx,
    cl_context *ctx, cl_command_queue *queue, int enable_profiling)
{
  cl_uint plat_count;

  CALL_CL_GUARDED(clGetPlatformIDs, (0, NULL, &plat_count));

  cl_platform_id *platforms = 
    (cl_platform_id *) malloc(plat_count*sizeof(cl_platform_id));
  CHECK_SYS_ERROR(!platforms, "allocating platform array");

  CALL_CL_GUARDED(clGetPlatformIDs, (plat_count, platforms, NULL));
  for (cl_uint i = 0; i < plat_count; ++i)
  {
    char buf[100];
    CALL_CL_GUARDED(clGetPlatformInfo, (platforms[i], CL_PLATFORM_VENDOR, 
          sizeof(buf), buf, NULL));

    if (!plat_name || strstr(buf, plat_name))
    {
      cl_uint dev_count;
      CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL,
            0, NULL, &dev_count));

      cl_device_id *devices = 
        (cl_device_id *) malloc(dev_count*sizeof(cl_device_id));
      CHECK_SYS_ERROR(!devices, "allocating device array");

      CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL,
            dev_count, devices, NULL));

      for (cl_uint j = 0; j < dev_count; ++j)
      {
        char buf[100];
        CALL_CL_GUARDED(clGetDeviceInfo, (devices[j], CL_DEVICE_NAME,
              sizeof(buf), buf, NULL));
        if (!dev_name || strstr(buf, dev_name))
        {
          if (idx == 0)
          {
            cl_platform_id plat = platforms[i];
            cl_device_id dev = devices[j];

            free(devices);
            free(platforms);

            cl_context_properties cps[3] = { 
              CL_CONTEXT_PLATFORM, (cl_context_properties) plat, 0 };

            cl_int status;
            *ctx = clCreateContext(
                cps, 1, &dev, NULL, NULL, &status);
            CHECK_CL_ERROR(status, "clCreateContext");


            cl_command_queue_properties qprops = 0;
            if (enable_profiling)
              qprops |= CL_QUEUE_PROFILING_ENABLE;

            *queue = clCreateCommandQueue(*ctx, dev, qprops, &status);
            CHECK_CL_ERROR(status, "clCreateCommandQueue");

            return;
          }
          else
            --idx;
        }
      }

      free(devices);
    }
  }

  free(platforms);

  fputs("create_context_on: specified device not found.\n", stderr);
  abort();
}
Ejemplo n.º 19
0
int main(int argc, char *argv[])
{
  int error, xsize, ysize, rgb_max;
  int *r, *b, *g;

  float *gray, *congray, *congray_cl;

  // identity kernel
  // float filter[] = {
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,1,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  // };

  // 45 degree motion blur
  float filter[] =
    {0,      0,      0,      0,      0, 0.0145,      0,
     0,      0,      0,      0, 0.0376, 0.1283, 0.0145,
     0,      0,      0, 0.0376, 0.1283, 0.0376,      0,
     0,      0, 0.0376, 0.1283, 0.0376,      0,      0,
     0, 0.0376, 0.1283, 0.0376,      0,      0,      0,
0.0145, 0.1283, 0.0376,      0,      0,      0,      0,
     0, 0.0145,      0,      0,      0,      0,      0};

  // mexican hat kernel
  // float filter[] = {
  //   0, 0,-1,-1,-1, 0, 0,
  //   0,-1,-3,-3,-3,-1, 0,
  //  -1,-3, 0, 7, 0,-3,-1,
  //  -1,-3, 7,24, 7,-3,-1,
  //  -1,-3, 0, 7, 0,-3,-1,
  //   0,-1,-3,-3,-3,-1, 0,
  //   0, 0,-1,-1,-1, 0, 0
  // };


  if(argc != 3)
  {
    fprintf(stderr, "Usage: %s image.ppm num_loops\n", argv[0]);
    abort();
  }

  const char* filename = argv[1];
  const int num_loops = atoi(argv[2]);


  // --------------------------------------------------------------------------
  // load image
  // --------------------------------------------------------------------------
  printf("Reading ``%s''\n", filename);
  ppma_read(filename, &xsize, &ysize, &rgb_max, &r, &g, &b);
  printf("Done reading ``%s'' of size %dx%d\n", filename, xsize, ysize);

  // --------------------------------------------------------------------------
  // allocate CPU buffers
  // --------------------------------------------------------------------------
  posix_memalign((void**)&gray, 32, xsize*ysize*sizeof(float));
  if(!gray) { fprintf(stderr, "alloc gray"); abort(); }
  posix_memalign((void**)&congray, 32, xsize*ysize*sizeof(float));
  if(!congray) { fprintf(stderr, "alloc gray"); abort(); }
  posix_memalign((void**)&congray_cl, 32, xsize*ysize*sizeof(float));
  if(!congray_cl) { fprintf(stderr, "alloc gray"); abort(); }

  // --------------------------------------------------------------------------
  // convert image to grayscale
  // --------------------------------------------------------------------------
  for(int n = 0; n < xsize*ysize; ++n)
    gray[n] = (0.21f*r[n])/rgb_max + (0.72f*g[n])/rgb_max + (0.07f*b[n])/rgb_max;

  // --------------------------------------------------------------------------
  // execute filter on cpu
  // --------------------------------------------------------------------------
  for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i)
  {
    for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j)
    {
      float sum = 0;
      for(int k = -HALF_FILTER_WIDTH; k <= HALF_FILTER_WIDTH; ++k)
      {
        for(int l = -HALF_FILTER_WIDTH; l <= HALF_FILTER_WIDTH; ++l)
        {
          sum += gray[(i+k)*xsize + (j+l)] *
            filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)];
        }
      }
      congray[i*xsize + j] = sum;
    }
  }

  // --------------------------------------------------------------------------
  // output cpu filtered image
  // --------------------------------------------------------------------------
  printf("Writing cpu filtered image\n");
  for(int n = 0; n < xsize*ysize; ++n)
    r[n] = g[n] = b[n] = (int)(congray[n] * rgb_max);
  error = ppma_write("output_cpu.ppm", xsize, ysize, r, g, b);
  if(error) { fprintf(stderr, "error writing image"); abort(); }

  // --------------------------------------------------------------------------
  // get an OpenCL context and queue
  // --------------------------------------------------------------------------
  cl_context ctx;
  cl_command_queue queue;
  create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0);
  print_device_info_from_queue(queue);

  // --------------------------------------------------------------------------
  // load kernels
  // --------------------------------------------------------------------------
  char *knl_text = read_file("convolution.cl");
  cl_kernel knl = kernel_from_string(ctx, knl_text, "convolution", NULL);
  free(knl_text);

#ifdef NON_OPTIMIZED
  int deviceWidth = xsize;
#else
  int deviceWidth = ((xsize + WGX - 1)/WGX)* WGX;
#endif
  int deviceHeight = ysize;
  size_t deviceDataSize = deviceHeight*deviceWidth*sizeof(float);

  // --------------------------------------------------------------------------
  // allocate device memory
  // --------------------------------------------------------------------------
  cl_int status;
  cl_mem buf_gray = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
     deviceDataSize, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_congray = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
      deviceDataSize, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_filter = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
     FILTER_WIDTH*FILTER_WIDTH*sizeof(float), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // --------------------------------------------------------------------------
  // transfer to device
  // --------------------------------------------------------------------------
#ifdef NON_OPTIMIZED
  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0,
        deviceDataSize, gray, 0, NULL, NULL));
#else
  size_t buffer_origin[3] = {0,0,0};
  size_t host_origin[3] = {0,0,0};
  size_t region[3] = {deviceWidth*sizeof(float), ysize, 1};
  clEnqueueWriteBufferRect(queue, buf_gray, CL_TRUE,
                           buffer_origin, host_origin, region,
                           deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,
                           gray, 0, NULL, NULL);
#endif

  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_filter, /*blocking*/ CL_TRUE, /*offset*/ 0,
        FILTER_WIDTH*FILTER_WIDTH*sizeof(float), filter, 0, NULL, NULL));

  // --------------------------------------------------------------------------
  // run code on device
  // --------------------------------------------------------------------------

  cl_int rows = ysize;
  cl_int cols = xsize;
  cl_int filterWidth = FILTER_WIDTH;
  cl_int paddingPixels = 2*HALF_FILTER_WIDTH;

  size_t local_size[] = { WGX, WGY };
  size_t global_size[] = {
    ((xsize-paddingPixels + local_size[0] - 1)/local_size[0])* local_size[0],
    ((ysize-paddingPixels + local_size[1] - 1)/local_size[1])* local_size[1],
  };

  cl_int localWidth = local_size[0] + paddingPixels;
  cl_int localHeight = local_size[1] + paddingPixels;
  size_t localMemSize = localWidth * localHeight * sizeof(float);

  CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray));
  CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray));
  CALL_CL_SAFE(clSetKernelArg(knl, 2, sizeof(buf_filter), &buf_filter));
  CALL_CL_SAFE(clSetKernelArg(knl, 3, sizeof(rows), &rows));
  CALL_CL_SAFE(clSetKernelArg(knl, 4, sizeof(cols), &cols));
  CALL_CL_SAFE(clSetKernelArg(knl, 5, sizeof(filterWidth), &filterWidth));
  CALL_CL_SAFE(clSetKernelArg(knl, 6, localMemSize, NULL));
  CALL_CL_SAFE(clSetKernelArg(knl, 7, sizeof(localHeight), &localHeight));
  CALL_CL_SAFE(clSetKernelArg(knl, 8, sizeof(localWidth), &localWidth));

  // --------------------------------------------------------------------------
  // print kernel info
  // --------------------------------------------------------------------------
  print_kernel_info(queue, knl);

  CALL_CL_SAFE(clFinish(queue));
  timestamp_type tic, toc;
  get_timestamp(&tic);
  for(int loop = 0; loop < num_loops; ++loop)
  {
    CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 2, NULL,
          global_size, local_size, 0, NULL, NULL));

    // Edit: Copy the blurred image to input buffer
#ifdef NON_OPTIMIZED
    CALL_CL_SAFE(clEnqueueCopyBuffer(queue, buf_congray, buf_gray, 0, 0,
          deviceDataSize, 0, NULL, NULL));
#else
    clEnqueueCopyBufferRect(queue, buf_congray, buf_gray,
                            buffer_origin, host_origin, region,
                            deviceWidth*sizeof(float), 0,
                            xsize*sizeof(float), 0,
                            0, NULL, NULL);
#endif
  }
  CALL_CL_SAFE(clFinish(queue));
  get_timestamp(&toc);

  double elapsed = timestamp_diff_in_seconds(tic,toc)/num_loops;
  printf("%f s\n", elapsed);
  printf("%f MPixels/s\n", xsize*ysize/1e6/elapsed);
  printf("%f GBit/s\n", 2*xsize*ysize*sizeof(float)/1e9/elapsed);
  printf("%f GFlop/s\n", (xsize-HALF_FILTER_WIDTH)*(ysize-HALF_FILTER_WIDTH)
	 *FILTER_WIDTH*FILTER_WIDTH/1e9/elapsed);

  // --------------------------------------------------------------------------
  // transfer back & check
  // --------------------------------------------------------------------------
#ifdef NON_OPTIMIZED
  CALL_CL_SAFE(clEnqueueReadBuffer(
        queue, buf_congray, /*blocking*/ CL_TRUE, /*offset*/ 0,
        xsize * ysize * sizeof(float), congray_cl,
        0, NULL, NULL));
#else
  buffer_origin[0] = 3*sizeof(float);
  buffer_origin[1] = 3;
  buffer_origin[2] = 0;

  host_origin[0] = 3*sizeof(float);
  host_origin[1] = 3;
  host_origin[2] = 0;

  region[0] = (xsize-paddingPixels)*sizeof(float);
  region[1] = (ysize-paddingPixels);
  region[2] = 1;

  clEnqueueReadBufferRect(queue, buf_congray, CL_TRUE,
      buffer_origin, host_origin, region,
      deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,
      congray_cl, 0, NULL, NULL);
#endif

  // --------------------------------------------------------------------------
  // output OpenCL filtered image
  // --------------------------------------------------------------------------
  printf("Writing OpenCL filtered image\n");

  // Edit: Keep pixel value in the interval [0, 255] to reduce boundary effect
  for(int n = 0; n < xsize*ysize; ++n) {
    int color = (int)(congray_cl[n] * rgb_max);

    if (color < 0) {
      color = 0;
    } else if (color > 255) {
      color = 255;
    }

    r[n] = g[n] = b[n] = color;
  }
  error = ppma_write("output_cl.ppm", xsize, ysize, r, g, b);
  if(error) { fprintf(stderr, "error writing image"); abort(); }

  // --------------------------------------------------------------------------
  // clean up
  // --------------------------------------------------------------------------
  CALL_CL_SAFE(clReleaseMemObject(buf_congray));
  CALL_CL_SAFE(clReleaseMemObject(buf_gray));
  CALL_CL_SAFE(clReleaseMemObject(buf_filter));
  CALL_CL_SAFE(clReleaseKernel(knl));
  CALL_CL_SAFE(clReleaseCommandQueue(queue));
  CALL_CL_SAFE(clReleaseContext(ctx));
  free(gray);
  free(congray);
  free(congray_cl);
  free(r);
  free(b);
  free(g);
}
int main(int argc, char *argv[])
{
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue command_queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem buffer;
    cl_int error;
    cl_event event;
    cl_ulong startTime, endTime;
    size_t globalSize[1], localSize[1], warpSize;
    FILE* fptr;
    unsigned long long start, end;

    void* hostData = NULL;

    /* Parse options */
    CommandParser(argc, argv);
    HostDataCreation(hostData);

    GetPlatformAndDevice(platform, device);
    fptr = fopen(g_opencl_ctrl.powerFile, "a");

    /* Create context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
    CHECK_CL_ERROR(error);

    /* Create command queue */
#ifdef USE_CL_2_0_API
    {
        cl_queue_properties property[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
        command_queue = clCreateCommandQueueWithProperties(context, device, property, &error);
    }
#else
    {
        command_queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &error);
    }
#endif
    CHECK_CL_ERROR(error);

    /* Create program */
    CreateAndBuildProgram(program, context, device, strdup(g_opencl_ctrl.fileName));

    /* Create kernels */
    kernel = clCreateKernel(program, g_opencl_ctrl.kernelName, &error);
    CHECK_CL_ERROR(error);

    error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &warpSize, NULL);
    CHECK_CL_ERROR(error);
    fprintf(stderr, "Preferred work group size: %lu\n", warpSize);

#if 0
    fprintf(stderr, "\nData before process:\n");
    switch (g_opencl_ctrl.dataType)
    {
        case TYPE_INT:
            {
                int *intptr = (int *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%d ", intptr[i]);
                fprintf(stderr, "\n");
            }
            break;
        case TYPE_FLOAT:
            {
                float *fltptr = (float *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%f ", fltptr[i]);
                fprintf(stderr, "\n");
            }
            break;
       case TYPE_DOUBLE:
            {
                double *dblptr = (double *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%lf ", dblptr[i]);
                fprintf(stderr, "\n");
            }
            break;
    }
#endif

    /* Create buffers */
    buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, g_opencl_ctrl.dataByte, hostData, &error);
    CHECK_CL_ERROR(error);

    /* Execute kernels */
    error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel, 1, sizeof(long), &g_opencl_ctrl.iteration);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel, 2, sizeof(int), &g_opencl_ctrl.interval);
    CHECK_CL_ERROR(error);

    start = PrintTimingInfo(fptr);

    globalSize[0] = g_opencl_ctrl.global_size;
    localSize[0] = g_opencl_ctrl.local_size;
    error = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalSize, localSize, 0, NULL, &event);
    CHECK_CL_ERROR(error);
    error = clFinish(command_queue);
    CHECK_CL_ERROR(error);

    end = PrintTimingInfo(fptr);
    fclose(fptr);

    error = clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, g_opencl_ctrl.dataByte, hostData, 0, NULL, NULL);
    CHECK_CL_ERROR(error);

#if 0
    fprintf(stderr, "\nData after process:\n");
    switch (g_opencl_ctrl.dataType)
    {
        case TYPE_INT:
            {
                int *intptr = (int *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%d ", intptr[i]);
                fprintf(stderr, "\n");
            }
            break;
        case TYPE_FLOAT:
            {
                float *fltptr = (float *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%f ", fltptr[i]);
                fprintf(stderr, "\n");
            }
            break;
       case TYPE_DOUBLE:
            {
                double *dblptr = (double *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%lf ", dblptr[i]);
                fprintf(stderr, "\n");
            }
            break;
    }
#endif

    /* Event profiling */
    error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL);
    CHECK_CL_ERROR(error);
    error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(endTime), &endTime, NULL);
    CHECK_CL_ERROR(error);
    fprintf(stderr, "\n['%s' execution time] %llu ns\n", g_opencl_ctrl.kernelName, (end - start) * 1000);
    fprintf(stdout, "%llu\n", (end - start) * 1000);

    /* Read the output */

    /* Release object */
    clReleaseKernel(kernel);
    clReleaseMemObject(buffer);
    clReleaseEvent(event);
    clReleaseProgram(program);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
    free(hostData);

    return 0;
}
void CreateAndBuildProgram(cl_program &target_program, cl_context context, cl_device_id device, char *fileName)
{
    FILE *fptr;
    size_t programSize;
    char *programSource;
    cl_int error, binaryError;

    fptr = fopen(fileName, "r");
    if (fptr == NULL)
    {
        fprintf(stderr, "No such file: '%s'\n", fileName);
        exit(1);
    }

    /* Read program source */
    fseek(fptr, 0, SEEK_END);
    programSize = ftell(fptr);
    rewind(fptr);

    programSource = (char *)malloc(sizeof(char) * (programSize + 1));
    programSource[programSize] = '\0';
    fread(programSource, sizeof(unsigned char), programSize, fptr);
    fclose(fptr);

    /* Create and build cl_program object */
    target_program = clCreateProgramWithSource(context, 1, (const char **)(&programSource), &programSize, &error);
    CHECK_CL_ERROR(error);
    free(programSource);

    error = clBuildProgram(target_program, 1, &device, "-cl-opt-disable", NULL, NULL);
    if (error < 0)
    {
        size_t logSize;
        char *programBuildLog;

        error = clGetProgramBuildInfo(target_program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
        CHECK_CL_ERROR(error);
        programBuildLog = (char *)malloc(sizeof(char) * (logSize + 1));
        error = clGetProgramBuildInfo(target_program, device, CL_PROGRAM_BUILD_LOG, logSize + 1, programBuildLog, NULL);
        CHECK_CL_ERROR(error);

        fprintf(stderr, "%s\n", programBuildLog);
        free(programBuildLog);
        exit(1);
    }

#if 0
    {
        size_t binarySize;
        error = clGetProgramInfo(target_program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, NULL);
        CHECK_CL_ERROR(error);

        unsigned char *binary = (unsigned char *) malloc(sizeof(unsigned char) * binarySize);
        error = clGetProgramInfo(target_program, CL_PROGRAM_BINARIES, binarySize, &binary, NULL);
        CHECK_CL_ERROR(error);

        FILE *fptr = fopen(BINARY_FILE_NAME, "w");
        fprintf(fptr, "%s", binary);
        fclose(fptr);
    }
#endif

    free(fileName);
}
void GetPlatformAndDevice(cl_platform_id & target_platform, cl_device_id & target_device)
{
    cl_platform_id* platforms;
    cl_device_id* devices;
    cl_uint count;
    cl_int error;
    size_t length;

    char *queryString;

    /* Find platform */
    error = clGetPlatformIDs(0, NULL, &count);
    CHECK_CL_ERROR(error);

    platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * count);
    clGetPlatformIDs(count, platforms, NULL);

    if (g_opencl_ctrl.platform_id >= count)
        {fprintf(stderr, "Error: Cannot find selected platform\n"); exit(1);}
    target_platform = platforms[g_opencl_ctrl.platform_id];

    /* Find device */
    error = clGetDeviceIDs(target_platform, CL_DEVICE_TYPE_ALL, 0, NULL, &count);
    CHECK_CL_ERROR(error);

    devices = (cl_device_id *)malloc(sizeof(cl_device_id) * count);
    clGetDeviceIDs(target_platform, CL_DEVICE_TYPE_ALL, count, devices, NULL);

    if (g_opencl_ctrl.device_id >= count)
        {fprintf(stderr, "Error: Cannot find selected device\n"); exit(1);}
    target_device = devices[g_opencl_ctrl.device_id];

    /* Get device name */
    error = clGetDeviceInfo(target_device, CL_DEVICE_NAME, 0, NULL, &length);
    CHECK_CL_ERROR(error);

    queryString = (char *)malloc(sizeof(char) * length);
    clGetDeviceInfo(target_device, CL_DEVICE_NAME, length, queryString, NULL);
    fprintf(stderr, "Device selected: '%s'\n", queryString);

    {
        cl_uint vectorSize;
        size_t groupSize;
        size_t itemSize[3];

        error = clGetDeviceInfo(target_device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(vectorSize), &vectorSize, NULL);
        CHECK_CL_ERROR(error);
        fprintf(stderr, "Preferred char vector width : %u\n", vectorSize);

        error = clGetDeviceInfo(target_device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(vectorSize), &vectorSize, NULL);
        CHECK_CL_ERROR(error);
        fprintf(stderr, "Preferred int vector width : %u\n", vectorSize);

        error = clGetDeviceInfo(target_device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(vectorSize), &vectorSize, NULL);
        CHECK_CL_ERROR(error);
        fprintf(stderr, "Preferred float vector width : %u\n", vectorSize);

        error = clGetDeviceInfo(target_device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(vectorSize), &vectorSize, NULL);
        CHECK_CL_ERROR(error);
        fprintf(stderr, "Preferred double vector width : %u\n", vectorSize);

        error = clGetDeviceInfo(target_device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
        CHECK_CL_ERROR(error);
        fprintf(stderr, "Maximum work group size : %lu\n", groupSize);

        error = clGetDeviceInfo(target_device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(itemSize), &itemSize, NULL);
        CHECK_CL_ERROR(error);
        fprintf(stderr, "Maximum work item size : %lu, %lu, %lu\n", itemSize[0], itemSize[1], itemSize[2]);
    }

    /* Free the space */
    free(platforms);
    free(devices);
    free(queryString);
}
Ejemplo n.º 23
0
int main (int argc, char *argv[])
{
  double *a, *a_reduced;

  if (argc != 3)
  {
    fprintf(stderr, "Usage: %s N nloops\n", argv[0]);
    abort();
  }

  const cl_long N = (cl_long) atol(argv[1]);
  const int nloops = atoi(argv[2]);

  cl_long Ngroups = (N + LDIM  - 1)/LDIM;
  Ngroups = (Ngroups + 8  - 1)/8;

  cl_context ctx;
  cl_command_queue queue;
  create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0);

  print_device_info_from_queue(queue);

  // --------------------------------------------------------------------------
  // load kernels
  // --------------------------------------------------------------------------
  char *knl_text = read_file("full_reduction.cl");
  cl_kernel knl = kernel_from_string(ctx, knl_text, "reduction",
      "-DLDIM=" STRINGIFY(LDIM));
  free(knl_text);

  // --------------------------------------------------------------------------
  // allocate and initialize CPU memory
  // --------------------------------------------------------------------------
  posix_memalign((void**)&a, 32, N*sizeof(double));
  if (!a) { fprintf(stderr, "alloc a"); abort(); }
  posix_memalign((void**)&a_reduced, 32, Ngroups*sizeof(double));
  if (!a_reduced) { fprintf(stderr, "alloc a_reduced"); abort(); }

  srand48(8);
  for(cl_long n = 0; n < N; ++n)
    a[n] = (double)drand48();
    // a[n] = n;

  // --------------------------------------------------------------------------
  // allocate device memory
  // --------------------------------------------------------------------------
  cl_int status;
  cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N*sizeof(double),
      0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_a_reduced[2];
  buf_a_reduced[0] = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
      Ngroups*sizeof(double), 0, &status);
  buf_a_reduced[1] = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
      Ngroups*sizeof(double), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // --------------------------------------------------------------------------
  // transfer to device
  // --------------------------------------------------------------------------
  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_a, /*blocking*/ CL_TRUE, /*offset*/ 0,
        N*sizeof(double), a,
        0, NULL, NULL));

  timestamp_type tic, toc;
  double elapsed;

  // --------------------------------------------------------------------------
  // run reduction_simple on device
  // --------------------------------------------------------------------------

  printf("Simple Reduction\n");
  double sum_gpu = 0.0;
  CALL_CL_SAFE(clFinish(queue));
  get_timestamp(&tic);
  for(int loop = 0; loop < nloops; ++loop)
  {
    int r = 0;
    size_t Ngroups_loop = Ngroups;
    SET_3_KERNEL_ARGS(knl, N, buf_a, buf_a_reduced[r]);

    size_t local_size[] = { LDIM };
    size_t global_size[] = { Ngroups_loop*LDIM };

    CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 1, NULL,
          global_size, local_size, 0, NULL, NULL));

    while(Ngroups_loop > 1)
    {
      cl_long N_reduce = Ngroups_loop;
      Ngroups_loop = (N_reduce + LDIM  - 1)/LDIM;
      Ngroups_loop = (Ngroups_loop + 8  - 1)/8;

      size_t local_size[] = { LDIM };
      size_t global_size[] = { Ngroups_loop*LDIM };

      SET_3_KERNEL_ARGS(knl, N_reduce, buf_a_reduced[r], buf_a_reduced[(r+1)%2]);

      CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 1, NULL,
            global_size, local_size, 0, NULL, NULL));

      r = (r+1)%2;
    }

    CALL_CL_SAFE(clEnqueueReadBuffer(
          queue, buf_a_reduced[r], /*blocking*/ CL_TRUE, /*offset*/ 0,
          Ngroups_loop*sizeof(double), a_reduced, 0, NULL, NULL));

    sum_gpu = 0.0;
    for(cl_long n = 0; n < Ngroups_loop; ++n)
      sum_gpu += a_reduced[n];
  }
  CALL_CL_SAFE(clFinish(queue));
  get_timestamp(&toc);

  elapsed = timestamp_diff_in_seconds(tic,toc)/nloops;
  printf("%f s\n", elapsed);
  printf("%f GB/s\n", N*sizeof(double)/1e9/elapsed);

  double sum_cpu = 0.0;
  for(cl_long n = 0; n < N; ++n)
    sum_cpu += a[n];

  printf("Sum CPU: %e\n", sum_cpu);

  printf("Sum GPU: %e\n", sum_gpu);

  printf("Relative Error: %e\n", fabs(sum_cpu-sum_gpu)/sum_gpu);

  // --------------------------------------------------------------------------
  // clean up
  // --------------------------------------------------------------------------
  CALL_CL_SAFE(clReleaseMemObject(buf_a));
  CALL_CL_SAFE(clReleaseMemObject(buf_a_reduced[0]));
  CALL_CL_SAFE(clReleaseMemObject(buf_a_reduced[1]));
  CALL_CL_SAFE(clReleaseKernel(knl));
  CALL_CL_SAFE(clReleaseCommandQueue(queue));
  CALL_CL_SAFE(clReleaseContext(ctx));

  free(a);
  free(a_reduced);

  return 0;
}
Ejemplo n.º 24
0
int main(int argc, char **argv)
{
  cl_int err;
  const char *krn_src;
  cl_program empty, program;
  cl_context ctx;
  cl_device_id did;
  cl_command_queue queue;
  cl_uint num_krn;
  cl_kernel kernels[2];

  err = poclu_get_any_device(&ctx, &did, &queue);
  CHECK_OPENCL_ERROR_IN("poclu_get_any_device");
  TEST_ASSERT( ctx );
  TEST_ASSERT( did );
  TEST_ASSERT( queue );

  /* Test creating a program from an empty source */
  empty = clCreateProgramWithSource(ctx, 1, &empty_src, NULL, &err);
  CHECK_OPENCL_ERROR_IN("clCreateProgramWithSource");
  err = clBuildProgram(empty, 0, NULL, NULL, NULL, NULL);
  CHECK_OPENCL_ERROR_IN("clBuildProgram");

  err = clCreateKernelsInProgram(empty, 0, NULL, &num_krn);
  CHECK_OPENCL_ERROR_IN("clCreateKernelsInProgram");
  TEST_ASSERT(num_krn == 0);

  krn_src = poclu_read_file(SRCDIR "/tests/runtime/test_clCreateKernelsInProgram.cl");
  TEST_ASSERT(krn_src);

  program = clCreateProgramWithSource(ctx, 1, &krn_src, NULL, &err);
  CHECK_OPENCL_ERROR_IN("clCreateProgramWithSource");
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  CHECK_OPENCL_ERROR_IN("clBuildProgram");

  err = clCreateKernelsInProgram(program, 0, NULL, &num_krn);
  CHECK_OPENCL_ERROR_IN("clCreateKernelsInProgram");
  // test_clCreateKernelsInProgram.cl has two kernel functions.
  TEST_ASSERT(num_krn == 2);

  err = clCreateKernelsInProgram(program, 2, kernels, NULL);
  CHECK_OPENCL_ERROR_IN("clCreateKernelsInProgram");

  // make sure the kernels were actually created 
  // Note: nothing in the specification says which kernel function
  // is kernels[0], which is kernels[1]. For now assume pocl/LLVM
  // orders these deterministacally
  err = clEnqueueTask(queue, kernels[0], 0, NULL, NULL); 
  CHECK_OPENCL_ERROR_IN("clEnqueueTask");

  err = clFinish(queue);
  CHECK_OPENCL_ERROR_IN("clFinish");

  err = clEnqueueTask(queue, kernels[1], 0, NULL, NULL);
  CHECK_OPENCL_ERROR_IN("clEnqueueTask");

  err = clFinish(queue);
  CHECK_OPENCL_ERROR_IN("clFinish");

  CHECK_CL_ERROR (clReleaseCommandQueue (queue));
  CHECK_CL_ERROR (clReleaseKernel (kernels[0]));
  CHECK_CL_ERROR (clReleaseKernel (kernels[1]));
  CHECK_CL_ERROR (clReleaseProgram (program));
  CHECK_CL_ERROR (clReleaseProgram (empty));
  CHECK_CL_ERROR (clReleaseContext (ctx));
  CHECK_CL_ERROR (clUnloadCompiler ());

  free ((void *)krn_src);

  return EXIT_SUCCESS;
}
int main(int argc, char *argv[])
{
    FILE* g_fptr;
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue command_queue;
    cl_program program;
    cl_kernel kernel1, kernel2;
    cl_mem inputBufferA;
    cl_int error;
    size_t globalSize[2], localSize[2];

    struct timeval startTime, endTime;

    void* inputMatrixA = NULL;
    /* Parse options */
    CommandParser(argc, argv);

    g_fptr = fopen(g_opencl_ctrl.powerFile, "a");
    if (!g_fptr)
        exit(1);

    HostDataCreation(inputMatrixA);

    GetPlatformAndDevice(platform, device);

    /* Create context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
    CHECK_CL_ERROR(error);

    /* Create command queue */
    command_queue = clCreateCommandQueue(context, device, 0, &error);
    CHECK_CL_ERROR(error);

    /* Create program */
    CreateAndBuildProgram(program, context, device, strdup(CL_FILE_NAME));

    /* Create kernels */
    kernel1 = clCreateKernel(program, "Generate", &error);
    CHECK_CL_ERROR(error);
    kernel2 = clCreateKernel(program, "Access", &error);
    CHECK_CL_ERROR(error);

    /* Create buffers */
    inputBufferA = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, g_opencl_ctrl.inputByteA, inputMatrixA, &error);
    CHECK_CL_ERROR(error);

    /* Execute kernels */
    error = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &inputBufferA);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel1, 1, sizeof(int), &g_opencl_ctrl.dataSizeW);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel1, 2, sizeof(int), &g_opencl_ctrl.dataSizeH);
    CHECK_CL_ERROR(error);

    error = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &inputBufferA);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel2, 1, sizeof(int), &g_opencl_ctrl.dataSizeW);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel2, 2, sizeof(int), &g_opencl_ctrl.iteration);
    CHECK_CL_ERROR(error);


    globalSize[0] = g_opencl_ctrl.dataSizeW;
    globalSize[1] = g_opencl_ctrl.dataSizeH;
    localSize[0] = g_opencl_ctrl.local_size1;
    localSize[1] = g_opencl_ctrl.local_size2;

    fprintf(stderr, "global size: %lu %lu\n", globalSize[0], globalSize[1]);
    fprintf(stderr, "local size: %lu %lu\n", localSize[0], localSize[1]);

    error = clEnqueueNDRangeKernel(command_queue, kernel1, 2, NULL, globalSize, localSize, 0, NULL, NULL);
    CHECK_CL_ERROR(error);
    error = clFinish(command_queue);
    CHECK_CL_ERROR(error);
 
    PrintTimingInfo(g_fptr);

    if (g_opencl_ctrl.timing)
        gettimeofday(&startTime, NULL);

    error = clEnqueueNDRangeKernel(command_queue, kernel2, 2, NULL, globalSize, localSize, 0, NULL, NULL);
    CHECK_CL_ERROR(error);
    error = clFinish(command_queue);
    CHECK_CL_ERROR(error);
    PrintTimingInfo(g_fptr);

    if (g_opencl_ctrl.timing)
        gettimeofday(&endTime, NULL);

    fclose(g_fptr);

    /* Read the output */
    error = clEnqueueReadBuffer(command_queue, inputBufferA, CL_TRUE, 0, g_opencl_ctrl.inputByteA, inputMatrixA, 0, NULL, NULL);
    CHECK_CL_ERROR(error);

    /* Release object */
    clReleaseKernel(kernel1);
    clReleaseKernel(kernel2);
    clReleaseMemObject(inputBufferA);
    clReleaseProgram(program);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
    free(inputMatrixA);

    if (g_opencl_ctrl.timing)
    {
        unsigned long long start, end;
        start = startTime.tv_sec * 1000000 + startTime.tv_usec;
        end = endTime.tv_sec * 1000000 + endTime.tv_usec;

        fprintf(stderr, "Kernel execution time: %llu ms\n", (end - start) / 1000);
        fprintf(stdout, "%llu\n", (end - start) * 1000);
    }

    fprintf(stderr, "DONE.\n");

    return 0;
}
Ejemplo n.º 26
0
sampler* initialize_sampler(cl_int chain_length, cl_int dimension,
                            cl_int walkers_per_group, size_t work_group_size,
                            double a, cl_int pdf_number,
                            cl_int data_length, cl_float *data,
                            cl_int num_to_save, cl_int *indices_to_save,
                            const char *plat_name, const char *dev_name){

    /*
     Initialize stretch move MCMC sampler struct.
     Arrange parameters into sampler struct pointer.
     Allocate arrays on host, initialize walkers and other values as appropriate.
     Start OpenCL context and queue.
     Allocate device memory and transfer from host.
     Compile and initialize random number generator.
     Compile stretch move OpenCL kernel.

     Input:
          cl_int chain_length                Allocate space for this many samples in the sampler struct.
                                                 Sampler fills this array when run_sampler is called.
          cl_int dimension                   Dimension of state vector of Markov chain.
          cl_int walkers_per_group           Number of walkers in each of two groups. Total walkers is twice this.
          size_t work_group_size             Work group size.
                                                 For CPU this must be set to one.
                                                 For GPU this should be set larger, powers of two are optimal, try 64, 128 or 256.
                                                 This number must divide walkers_per_group.
          double a                           Coefficient for range of 'z' random variable.
                                                 Must be greater than one.
                                                 Standard value is 2.
                                                 Decrease a to increase low acceptance rate, especially in high dimensions.
          cl_int pdf_number                  Which PDF to sample. Passed to pdf.h as a compile time definition.
          cl_int data_length                 Length of observation data. If no data set this to zero.
          cl_float *data                     Observation data.
          cl_int num_to_save                 Number of components to save in the chain
          cl_int *indices_to_save            Indices of components to save in the chain
          const char *plat_name              String for platform name. Set to CHOOSE_INTERACTIVELY (no quotes) to do so.
          const char *dev_name               String for device name. Set to CHOOSE_INTERACTIVELY (no quotes) to do so.

     Output:
          returned: sampler *samp            Pointer to sampler struct with parameters, arrays, context, queue, kernel initialized.
     */


    if(OUTPUT_LEVEL > 0) printf("Initializing Stretch Move sampler.\n");


    // --------------------------------------------------------------------------
    // Set parameters
    // --------------------------------------------------------------------------

    // This environment variable forces headers to be reloaded each time
    // If not set and pdf if changed, changes may not be updated
    setenv("CUDA_CACHE_DISABLE", "1", 1);

    // allocate the structure for all the sampler parameters and arrays
    sampler * samp = (sampler *) malloc(sizeof(sampler));
    if(!samp) { perror("Allocation failure sampler"); abort(); }

    // user set parameters
    samp->M = chain_length;                           // Number of steps to run
    samp->N = dimension;                              // Dimension of the problem and the walkers
    samp->K_over_two = walkers_per_group ;            // Number of walkers in each group

    // derived parameters
    samp->K = 2 * samp->K_over_two;                   // Total walkers
    samp->total_samples = samp->M * samp->K;          // Total samples produced

    // indices to save
    samp->num_to_save = num_to_save;
    samp->indices_to_save_host = indices_to_save;

    // Allocate the structure and set values
    samp->data_st = (data_struct *) malloc(sizeof(data_struct));
    if(!(samp->data_st)) { perror("Allocation failure data_struct"); abort(); }

    // default value one, unless performing simulated annealing
    (samp->data_st)->beta         = 1.0f;
    (samp->data_st)->save         = 1;
    (samp->data_st)->num_to_save  = num_to_save;

    // coefficient on Z random variable
    samp->a = a;
    double a_coeffs[3];
    a_coeffs[0] = 1.0 / a;
    a_coeffs[1] = 2.0 * (1.0 - 1.0/a);
    a_coeffs[2] = a - 2.0 + 1.0/a;


    // error check on dimensions
    if(samp->K <= samp->N){
        fprintf(stderr, "Error: Must have more walkers than the dimension.\nExiting\n");
        abort();
    }

    // error check on work sizes
    if( (samp->K_over_two % work_group_size) != 0){
        fprintf(stderr, "Error: Number of walkers in each group must be multiple of work group size.\nExiting\n");
        abort();
    }

    // error check on dimensions to save
    for(int i=0; i<num_to_save; i++){
        if(samp->indices_to_save_host[i] >= samp->N){
            fprintf(stderr, "Error: Cannot save an index larger than the dimension of the problem.\nExiting\n");
            abort();
        }
    }

    if(a <= 1.0){
        fprintf(stderr, "Error: Value of a must be greater than one.\nDefaulting to 2.\n");
        samp->a = 2.0;
    }


    // for later output
    samp->acor_times  = (double *) malloc(samp->num_to_save * sizeof(double));
    if(!samp->acor_times) { perror("Allocation failure"); abort(); }
    samp->acor_pass   = (char   *) malloc(samp->num_to_save * sizeof(char));
    if(!samp->acor_pass) { perror("Allocation failure"); abort(); }
    samp->sigma       = (double *) malloc(samp->num_to_save * sizeof(double));
    if(!samp->sigma)      { perror("Allocation failure"); abort(); }
    samp->means       = (double *) malloc(samp->num_to_save * sizeof(double));
    if(!samp->means)      { perror("Allocation failure"); abort(); }
    samp->err_bar     = (double *) malloc(samp->num_to_save * sizeof(double));
    if(!samp->err_bar)    { perror("Allocation failure"); abort(); }

    // write parameter file for plotting
    write_parameter_file_matlab(samp->M, samp->N, samp->K, "Stretch Move",
                            samp->indices_to_save_host, samp->num_to_save, pdf_number);

    // --------------------------------------------------------------------------
    // Set up OpenCL context and queues
    // --------------------------------------------------------------------------
    if(OUTPUT_LEVEL > 0) printf("Begin opencl contexts.\n");

    create_context_on(plat_name, dev_name, 0, &(samp->ctx), NULL, 0);

    {
      cl_int status;
      cl_device_id my_dev;

      CALL_CL_GUARDED(clGetContextInfo, (samp->ctx, CL_CONTEXT_DEVICES,
            sizeof(my_dev), &my_dev, NULL));

      samp->queue = clCreateCommandQueue(samp->ctx, my_dev, 0, &status);
      CHECK_CL_ERROR(status, "clCreateCommandQueue");
      samp->queue_mem = clCreateCommandQueue(samp->ctx, my_dev, 0, &status);
      CHECK_CL_ERROR(status, "clCreateCommandQueue");
    }

    // print information on selected device
    if(OUTPUT_LEVEL > 1)  print_device_info_from_queue(samp->queue);

    // set the work group sizes
    samp->ldim[0] = work_group_size;
    samp->gdim[0] = samp->K_over_two;

    if(OUTPUT_LEVEL > 0) printf("Context built.\n");


    // --------------------------------------------------------------------------
    // Start total timing
    // --------------------------------------------------------------------------
    if(OUTPUT_LEVEL > 0) printf("Begin total timing.\n");
    get_timestamp(&(samp->time1_total));


    // --------------------------------------------------------------------------
    // Allocate host memory
    // --------------------------------------------------------------------------

    // counter for number of samples accepted
    samp->accepted_host = (cl_ulong *) malloc(samp->K_over_two * sizeof(cl_ulong));
    if(!(samp->accepted_host)){ perror("Allocation failure accepted host"); abort(); }
    for(int i=0; i< (samp->K_over_two); i++) samp->accepted_host[i] = 0;

    // Adjacent memory on x_red moves with in the walker
    // To access the ith component of walker j, take x_red[i + j*N];

    // red walkers
    samp->X_red_host = (cl_float *) malloc(samp->N * samp->K_over_two * sizeof(cl_float));
    if(!(samp->X_red_host)){ perror("Allocation failure X_red_host"); abort(); }

    // log likelihood
    samp->log_pdf_red_host = (cl_float *) malloc(samp->K_over_two * sizeof(cl_float));
    if(!(samp->log_pdf_red_host)){ perror("Allocation failure X_red_host"); abort(); }
    for(int i=0; i<(samp->K_over_two); i++) samp->log_pdf_red_host[i] = (-1.0f) / 0.0f;

    // black walkers
    samp->X_black_host = (cl_float *) malloc(samp->N * samp->K_over_two * sizeof(cl_float));
    if(!(samp->X_black_host)){ perror("Allocation failure X_black_host"); abort(); }

    // log likelihood
    samp->log_pdf_black_host = (cl_float *) malloc(samp->K_over_two * sizeof(cl_float));
    if(!(samp->log_pdf_black_host)){ perror("Allocation failure X_red_host"); abort(); }
    for(int i=0; i< (samp->K_over_two); i++) samp->log_pdf_black_host[i] = (-1.0f) / 0.0f;

    // samples on host
    cl_int samples_length = samp->num_to_save * samp->M * samp->K;                // length of the samples array
    samp->samples_host = (cl_float *) malloc(samples_length * sizeof(cl_float));         // samples to return
    if(!(samp->samples_host)){ perror("Allocation failure samples_host"); abort(); }


    // intialize the walkers to random values
    // set the seed value
    srand48(0);

    // initialize the walkers to small random values
    for(int j=0; j < samp->N * samp->K_over_two; j++){
        if(NONNEGATIVE_BOX){
            samp->X_black_host[j] = (cl_float) drand48();
            samp->X_red_host[j]   = (cl_float) drand48();
        }
        else{
            samp->X_black_host[j] = (cl_float) (0.1 * (drand48()-0.5));
            samp->X_red_host[j]   = (cl_float) (0.1 * (drand48()-0.5));
        }

    }


    // set up observations
    samp->data_length = data_length;

    // there are lots of complications that appear if this is empty
    // make it length one instead
    if(samp->data_length == 0){
        samp->data_length = 1;
        samp->data_host = (cl_float *) malloc(samp->data_length * sizeof(cl_float)) ;
        if(!(samp->data_host)){ perror("Allocation failure data_host"); abort(); }
        samp->data_host[0] = 0.0f;
    }
    else{
        // standard case
        samp->data_host = data;
    }


    // --------------------------------------------------------------------------
    // load kernels
    // --------------------------------------------------------------------------

    // stretch move kernel
    char *knl_text = read_file("stretch_move.cl");
    char options[300];
    sprintf(options, "-D NN=%d -D K_OVER_TWO=%d -D WORK_GROUP_SIZE=%d -D DATA_LEN=%d -D PDF_NUMBER=%d -D A_COEFF_0=%.10ff -D A_COEFF_1=%.10ff -D A_COEFF_2=%.10ff  -I . ",
            samp->N, samp->K_over_two, (int) work_group_size, samp->data_length, pdf_number, a_coeffs[0], a_coeffs[1], a_coeffs[2]);

    if(OUTPUT_LEVEL > 0) printf("Options string for stretch move kernel:%s\n", options);

    samp->stretch_knl = kernel_from_string(samp->ctx, knl_text, "stretch_move", options);
    free(knl_text);

    if(OUTPUT_LEVEL > 0) printf("Stretch Move kernel compiled.\n");

    // random number generator initialization
    char * knl_text_rand = read_file("Kernel_Ranluxcl_Init.cl");
    char options_rand_lux[100];

    if(AMD)
        sprintf(options_rand_lux, "-DRANLUXCL_LUX=4 -I .");
    else
        sprintf(options_rand_lux, "-DRANLUXCL_LUX=4");

    samp->init_rand_lux_knl = kernel_from_string(samp->ctx, knl_text_rand, "Kernel_Ranluxcl_Init", options_rand_lux);
    free(knl_text_rand);

    if(OUTPUT_LEVEL > 0) printf("Ranluxcl init kernel compiled.\n");



    // --------------------------------------------------------------------------
    // allocate device memory
    // --------------------------------------------------------------------------
    cl_int status;

    samp->X_red_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->N * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->log_pdf_red_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->X_red_save = clCreateBuffer(samp->ctx, CL_MEM_WRITE_ONLY,
      sizeof(cl_float) * samp->num_to_save * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->X_black_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->N * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->log_pdf_black_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->X_black_save = clCreateBuffer(samp->ctx, CL_MEM_WRITE_ONLY,
      sizeof(cl_float) * samp->num_to_save * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->accepted_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
            samp->K_over_two * sizeof(cl_ulong), 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->indices_to_save_device = clCreateBuffer(samp->ctx, CL_MEM_READ_ONLY,
            samp->num_to_save * sizeof(cl_int), 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");


    // allocate for the observations
    samp->data_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->data_length, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    // data struct on device
    samp->data_st_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
            sizeof(data_struct), 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");


    // allocate for the state array for randluxcl
    // use a 1d work group
    size_t rand_lux_state_buffer_size = samp->gdim[0] * 7 * sizeof(cl_float4);
    samp->ranluxcltab = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
        rand_lux_state_buffer_size, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");


    // --------------------------------------------------------------------------
    // transfer to device
    // --------------------------------------------------------------------------

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->X_red_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->N * samp->K_over_two * sizeof(cl_float), samp->X_red_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->log_pdf_red_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->K_over_two * sizeof(cl_float), samp->log_pdf_red_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->X_black_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->N * samp->K_over_two * sizeof(cl_float), samp->X_black_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->log_pdf_black_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->K_over_two * sizeof(cl_float), samp->log_pdf_black_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->data_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->data_length * sizeof(cl_float), samp->data_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->data_st_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        sizeof(data_struct), samp->data_st,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->indices_to_save_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->num_to_save * sizeof(cl_int), samp->indices_to_save_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clFinish, (samp->queue));


    // --------------------------------------------------------------------------
    // Initialize random number generator
    // --------------------------------------------------------------------------

    // int for state variable initialization
    cl_int ins = 1;
    SET_2_KERNEL_ARGS(samp->init_rand_lux_knl, ins, samp->ranluxcltab);

    CALL_CL_GUARDED(clEnqueueNDRangeKernel,
          (samp->queue, samp->init_rand_lux_knl,
           /*dimensions*/ 1, NULL, samp->gdim, samp->ldim,
           0, NULL, NULL));

    CALL_CL_GUARDED(clFinish, (samp->queue));

    // --------------------------------------------------------------------------
    // Sampler initialization is done
    // --------------------------------------------------------------------------
    if(OUTPUT_LEVEL > 0) printf("Sampler initialized.\n");
    return samp;
}
Ejemplo n.º 27
0
int
main (int argv, char **argc)
{
  /////////////////////////
  ////// SAME IN EVERY FILE
  /////////////////////////

  // create context and command queue
  cl_context       __sheets_context;
  cl_command_queue __sheets_queue;
  int              _i;
  cl_int           __cl_err;
  
  create_context_on(SHEETS_PLAT_NAME,
		    SHEETS_DEV_NAME,
		    0,		/* choose the first (only) available device */
		    &__sheets_context,
		    &__sheets_queue,
		    0);

  // compile kernels
  for (_i = 0; _i < NKERNELS; _i++) {
    compiled_kernels[_i] = kernel_from_string(__sheets_context,
					      kernel_strings[_i],
					      kernel_names[_i],
					      SHEETS_KERNEL_COMPILE_OPTS);
  }

  ////// [END]

  size_t __SIZE_wav = atoi(argc[1]);

  float wav[__SIZE_wav];
  const char *file_name = "mytune.wav";
  int in_thrsh_cnt = 0;

  timestamp_type st;
  timestamp_type end;

  get_timestamp(&st);		
  for (_i = 0; _i < __SIZE_wav; _i++) {
    wav[_i] = (float) rand() / RAND_MAX;
    if (in_thrsh(wav[_i], 0.1112, 0.7888))
      in_thrsh_cnt++;
  }
  get_timestamp(&end);

  printf("cpu execution took %f seconds\n", timestamp_diff_in_seconds(st, end));

  get_timestamp(&st);

  /////////////////
  ////// GFUNC CALL
  /////////////////

  /// create variables for function arguments given as literals
  float __PRIM_band_restrict_ARG2 = 0.1112f;
  float __PRIM_band_restrict_ARG3 = 0.7888f;

  /// return array (always arg0)
  cl_mem __CLMEM_band_restrict_ARG0 = clCreateBuffer(__sheets_context, 
						     CL_MEM_WRITE_ONLY, 
						     sizeof(float) * __SIZE_wav, 
						     NULL, 
						     &__cl_err);
  CHECK_CL_ERROR(__cl_err, "clCreateBuffer");
					      
  /// input arrays
  cl_mem __CLMEM_band_restrict_ARG1 = clCreateBuffer(__sheets_context, 
						     CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
						     sizeof(float) * __SIZE_wav, 
						     (void *) wav, 
						     &__cl_err);
  CHECK_CL_ERROR(__cl_err, "clCreateBuffer");

  /// write to device memory
  CALL_CL_GUARDED(clEnqueueWriteBuffer,
		 (__sheets_queue,
		  __CLMEM_band_restrict_ARG1,
		  CL_TRUE,	/* blocking write */
		  0, 		/* no offset */
		  sizeof(float) * __SIZE_wav,
		  wav,
		  0,		/* no wait list */
		  NULL,
		  NULL)
		  );
  
  /// set up kernel arguments
  SET_4_KERNEL_ARGS(compiled_kernels[0],
		    __CLMEM_band_restrict_ARG0,
		    __CLMEM_band_restrict_ARG1,
		    __PRIM_band_restrict_ARG2,
		    __PRIM_band_restrict_ARG3);

  /// enqueue kernel
  cl_event __CLEVENT_band_restrict_CALL;
  CALL_CL_GUARDED(clEnqueueNDRangeKernel,
		  (__sheets_queue,
		   compiled_kernels[0],
		   1,		/* 1 dimension */
		   0,		/* 0 offset */
		   &__SIZE_wav,
		   NULL,	/* let OpenCL break things up */
		   0,		/* no events in wait list */
		   NULL,	/* empty wait list */
		   &__CLEVENT_band_restrict_CALL)
		  );

  /// allocate space for cpu return array
  float out[__SIZE_wav];
  
  CALL_CL_GUARDED(clEnqueueReadBuffer,
		  (__sheets_queue,
		   __CLMEM_band_restrict_ARG0,
		   CL_TRUE,	 /* blocking read */
		   0,		 /* 0 offset */
		   sizeof(float) * __SIZE_wav, 	 /* read whole buffer */
		   (void *) out, /* host pointer */
		   1,		 /* wait for gfunc to finish */
		   &__CLEVENT_band_restrict_CALL, /* "" */
		   NULL)			  /* no need to wait for this call though */
		  );
  
  ////// [END] GFUNC CALL

  get_timestamp(&end);

  printf("gfunc call took %f seconds\n", timestamp_diff_in_seconds(st, end));

  ////// Validate call
  int c = 0;

  for (_i = 0; _i < __SIZE_wav; _i++) {
    if (in_thrsh(out[_i], 0.1112, 0.7888)) {
      c++;
    } else if(out[_i]) {
      exit(1);
    }
  }

  printf("\n");
	 
  assert(in_thrsh_cnt == c);

  //////////////
  ////// CLEANUP
  //////////////

  CALL_CL_GUARDED(clReleaseMemObject, (__CLMEM_band_restrict_ARG0));
  CALL_CL_GUARDED(clReleaseMemObject, (__CLMEM_band_restrict_ARG1));
  for (_i = 0; _i < NKERNELS; _i++) {
    CALL_CL_GUARDED(clReleaseKernel, (compiled_kernels[_i]));
  }
  CALL_CL_GUARDED(clReleaseCommandQueue, (__sheets_queue));
  CALL_CL_GUARDED(clReleaseContext, (__sheets_context));

  return 0;
}
Ejemplo n.º 28
0
int main(int argc, char **argv)
{
  if (argc != 3)
  {
    fprintf(stderr, "need two arguments!\n");
    abort();
  }

  const long n = atol(argv[1]);
  const long size = n*n;
  const int ntrips = atoi(argv[2]);

  cl_context ctx;
  cl_command_queue queue;
  create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0);

  cl_int status;

  // --------------------------------------------------------------------------
  // load kernels 
  // --------------------------------------------------------------------------
  char *knl_text = read_file("transpose-soln.cl");
  cl_kernel knl = kernel_from_string(ctx, knl_text, "transpose", NULL);
  free(knl_text);

  // --------------------------------------------------------------------------
  // allocate and initialize CPU memory
  // --------------------------------------------------------------------------
#ifdef USE_PINNED
  cl_mem buf_a_host = clCreateBuffer(ctx,
      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(value_type) * size, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");
  cl_mem buf_b_host = clCreateBuffer(ctx,
      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(value_type) * size, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  value_type *a = (value_type *) clEnqueueMapBuffer(queue, buf_a_host,
      /*blocking*/ CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 
      /*offs*/ 0, sizeof(value_type)*size, 0, NULL, NULL, &status);
  CHECK_CL_ERROR(status, "clEnqueueMapBuffer");
  value_type *b = (value_type *) clEnqueueMapBuffer(queue, buf_b_host,
      /*blocking*/ CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 
      /*offs*/ 0, sizeof(value_type)*size, 0, NULL, NULL, &status);
  CHECK_CL_ERROR(status, "clEnqueueMapBuffer");

#else
  value_type *a = (value_type *) malloc(sizeof(value_type) * size);
  if (!a) { perror("alloc x"); abort(); }
  value_type *b = (value_type *) malloc(sizeof(value_type) * size);
  if (!b) { perror("alloc y"); abort(); }
#endif

  for (size_t j = 0; j < n; ++j)
    for (size_t i = 0; i < n; ++i)
      a[i + j*n] = i + j*n;

  // --------------------------------------------------------------------------
  // allocate device memory
  // --------------------------------------------------------------------------
  cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
      sizeof(value_type) * size, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
      sizeof(value_type) * size, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // --------------------------------------------------------------------------
  // transfer to device
  // --------------------------------------------------------------------------
  CALL_CL_GUARDED(clFinish, (queue));

  timestamp_type time1, time2;
  get_timestamp(&time1);

  CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        queue, buf_a, /*blocking*/ CL_FALSE, /*offset*/ 0,
        size * sizeof(value_type), a,
        0, NULL, NULL));

  CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        queue, buf_b, /*blocking*/ CL_FALSE, /*offset*/ 0,
        size * sizeof(value_type), b,
        0, NULL, NULL));

  get_timestamp(&time2);
  double elapsed = timestamp_diff_in_seconds(time1,time2);
  printf("transfer: %f s\n", elapsed);
  printf("transfer: %f GB/s\n",
      2*size*sizeof(value_type)/1e9/elapsed);


  // --------------------------------------------------------------------------
  // run code on device
  // --------------------------------------------------------------------------

  CALL_CL_GUARDED(clFinish, (queue));

  get_timestamp(&time1);

  for (int trip = 0; trip < ntrips; ++trip)
  {
    SET_3_KERNEL_ARGS(knl, buf_a, buf_b, n);
    size_t ldim[] = { 16, 16 };
    size_t gdim[] = { n, n };
    CALL_CL_GUARDED(clEnqueueNDRangeKernel,
        (queue, knl,
         /*dimensions*/ 2, NULL, gdim, ldim,
         0, NULL, NULL));
  }

  CALL_CL_GUARDED(clFinish, (queue));

  get_timestamp(&time2);
  elapsed = timestamp_diff_in_seconds(time1,time2)/ntrips;
  printf("%f s\n", elapsed);
  printf("%f GB/s\n",
      2*size*sizeof(value_type)/1e9/elapsed);

  CALL_CL_GUARDED(clEnqueueReadBuffer, (
        queue, buf_b, /*blocking*/ CL_FALSE, /*offset*/ 0,
        size * sizeof(value_type), b,
        0, NULL, NULL));

  CALL_CL_GUARDED(clFinish, (queue));

  for (size_t i = 0; i < n; ++i)
    for (size_t j = 0; j < n; ++j)
      if (a[i + j*n] != b[j + i*n])
      {
        printf("bad %d %d\n", i, j);
        abort();
      }

  // --------------------------------------------------------------------------
  // clean up
  // --------------------------------------------------------------------------
  CALL_CL_GUARDED(clFinish, (queue));
  CALL_CL_GUARDED(clReleaseMemObject, (buf_a));
  CALL_CL_GUARDED(clReleaseMemObject, (buf_b));
  CALL_CL_GUARDED(clReleaseKernel, (knl));
  CALL_CL_GUARDED(clReleaseCommandQueue, (queue));
  CALL_CL_GUARDED(clReleaseContext, (ctx));

#ifdef USE_PINNED
  CALL_CL_GUARDED(clReleaseMemObject, (buf_a_host));
  CALL_CL_GUARDED(clReleaseMemObject, (buf_b_host));
#else
  free(a);
  free(b);
#endif
  return 0;
}
Ejemplo n.º 29
0
int
main(void){
  cl_int err;
  cl_platform_id platforms[MAX_PLATFORMS];
  cl_uint nplatforms;
  cl_device_id devices[MAX_DEVICES + 1]; // + 1 for duplicate test
  cl_device_id device_id0;
  cl_uint num_devices;
  size_t i;
  size_t num_binaries;
  const unsigned char **binaries = NULL;
  size_t *binary_sizes = NULL;
  size_t num_bytes_copied;
  cl_int binary_statuses[MAX_BINARIES];
  cl_int binary_statuses2[MAX_BINARIES];
  cl_program program = NULL;
  cl_program program_with_binary = NULL;

  err = clGetPlatformIDs(MAX_PLATFORMS, platforms, &nplatforms);
  CHECK_OPENCL_ERROR_IN("clGetPlatformIDs");
  if (!nplatforms)
    return EXIT_FAILURE;
  
  err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, MAX_DEVICES,
                      devices, &num_devices);
  CHECK_OPENCL_ERROR_IN("clGetDeviceIDs");

  cl_context context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &err);
  CHECK_OPENCL_ERROR_IN("clCreateContext");

  size_t kernel_size = strlen(kernel);
  char* kernel_buffer = kernel;

  program = clCreateProgramWithSource(context, 1, (const char**)&kernel_buffer, 
				      &kernel_size, &err);
  CHECK_OPENCL_ERROR_IN("clCreateProgramWithSource");

  err = clBuildProgram(program, num_devices, devices, NULL, NULL, NULL);
  CHECK_OPENCL_ERROR_IN("clBuildProgram");
  
  err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, 0, 0, &num_binaries);
  CHECK_OPENCL_ERROR_IN("clGetProgramInfo");

  num_binaries = num_binaries/sizeof(size_t);
  binary_sizes = (size_t*)malloc(num_binaries * sizeof(size_t));
  binaries = (const unsigned char**)calloc(num_binaries, sizeof(unsigned char*));

  err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, 
			 num_binaries*sizeof(size_t), binary_sizes , 
			 &num_bytes_copied);
  CHECK_OPENCL_ERROR_IN("clGetProgramInfo");
  
  for (i = 0; i < num_binaries; ++i) 
    binaries[i] = (const unsigned char*) malloc(binary_sizes[i] *
						sizeof(const unsigned char));

  err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, 
			 num_binaries*sizeof(char*), binaries, &num_bytes_copied);
  CHECK_OPENCL_ERROR_IN("clGetProgramInfo");
  
  cl_uint num = num_binaries < num_devices ? num_binaries : num_devices;
  if (num == 0)
    {
      err = !CL_SUCCESS;
      goto FREE_AND_EXIT;
    }
  
  program_with_binary = clCreateProgramWithBinary(context, num, devices, binary_sizes, 
						  binaries, binary_statuses, &err);
  CHECK_OPENCL_ERROR_IN("clCreateProgramWithBinary");

  for (i = 0; i < num; ++i) {
      cl_program_binary_type bin_type = 0;
      err = clGetProgramBuildInfo(program_with_binary, devices[i],
                                  CL_PROGRAM_BINARY_TYPE,
                                  sizeof(bin_type), (void *)&bin_type,
                                  NULL);
      CHECK_OPENCL_ERROR_IN("get program binary type");

      /* cl_program_binary_type */
      switch(bin_type) {
        case CL_PROGRAM_BINARY_TYPE_NONE: /*0x0*/
          fprintf(stderr, "program binary type: CL_PROGRAM_BINARY_TYPE_NONE\n");
        break;
        case CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT: /*0x1*/
          fprintf(stderr, "program binary type: CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT\n");
        break;
        case CL_PROGRAM_BINARY_TYPE_LIBRARY: /*0x2*/
          fprintf(stderr, "program binary type: CL_PROGRAM_BINARY_TYPE_LIBRARY\n");
        break;
        case CL_PROGRAM_BINARY_TYPE_EXECUTABLE: /*0x4*/
          fprintf(stderr, "program binary type: CL_PROGRAM_BINARY_TYPE_EXECUTABLE\n");
         break;
      }
  }
  err = clReleaseProgram(program_with_binary);
  CHECK_OPENCL_ERROR_IN("clReleaseProgram");

  for (i = 0; i < num; i++)
    {
      if (binary_statuses[i] != CL_SUCCESS)
        {
          err = !CL_SUCCESS;
          goto FREE_AND_EXIT;
        }
    }
    
  // negative test1: invalid device
  device_id0 = devices[0];
  devices[0] = NULL; // invalid device
  program_with_binary = clCreateProgramWithBinary(context, num, devices, binary_sizes, 
						  binaries, binary_statuses, &err);

  if (err != CL_INVALID_DEVICE || program_with_binary != NULL)
    {
      err = !CL_SUCCESS;
      goto FREE_AND_EXIT;
    }
  err = CL_SUCCESS;

  devices[0] = device_id0;
  for (i = 0; i < num_binaries; ++i) free((void*)binaries[i]);
  free(binary_sizes);
  free(binaries);
  
  // negative test2: duplicate device
  num_binaries = 2;
  devices[1] = devices[0]; // duplicate
  
  binary_sizes = (size_t*)malloc(num_binaries * sizeof(size_t));
  binaries = (const unsigned char**)calloc(num_binaries, sizeof(unsigned char*));
  
  err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, 1*sizeof(size_t), 
			 binary_sizes , &num_bytes_copied);
  CHECK_OPENCL_ERROR_IN("clGetProgramInfo");
  
  binary_sizes[1] = binary_sizes[0];
  
  binaries[0] = (const unsigned char*) malloc(binary_sizes[0] *
					      sizeof(const unsigned char));
  binaries[1] = (const unsigned char*) malloc(binary_sizes[1] *
					      sizeof(const unsigned char));
  
  err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, 1 * sizeof(char*), 
			 binaries, &num_bytes_copied);
  CHECK_OPENCL_ERROR_IN("clGetProgramInfo");
  
  memcpy((void*)binaries[1], (void*)binaries[0], binary_sizes[0]);      
  program_with_binary = clCreateProgramWithBinary(context, 2, devices, binary_sizes, 
						  binaries, binary_statuses2, &err);
  if (err != CL_INVALID_DEVICE || program_with_binary != NULL)
    {
      err = !CL_SUCCESS;
      goto FREE_AND_EXIT;
    }
  err = CL_SUCCESS;

 FREE_AND_EXIT:  
  // Free resources
  for (i = 0; i < num_binaries; ++i) 
    if (binaries) 
      if(binaries[i]) 
	free((void*)binaries[i]);

  if (binary_sizes) 
    free(binary_sizes);
  if (binaries) 
    free(binaries);
  if (program)
    CHECK_CL_ERROR (clReleaseProgram (program));
  if (program_with_binary)
    CHECK_CL_ERROR (clReleaseProgram (program_with_binary));
  if (context)
    CHECK_CL_ERROR (clReleaseContext (context));

  CHECK_CL_ERROR (clUnloadCompiler ());

  return err == CL_SUCCESS ? EXIT_SUCCESS : EXIT_FAILURE;
}
Ejemplo n.º 30
0
int main(int argc, char **argv) {
  unsigned int n = 100;
  
  double *h_a;
  double *h_b;
  double *h_c;
  cl_mem mem_list[3];
  const void *args_mem_loc[3];

  struct native_kernel_args args;
 
  cl_mem d_a;
  cl_mem d_b;
  cl_mem d_c;
  
  cl_context ctx;
  cl_device_id did;
  cl_command_queue queue;
 
  size_t bytes = n * sizeof(double);
 
  h_a = (double *) malloc(bytes);
  h_b = (double *) malloc(bytes);
  h_c = (double *) malloc(bytes);
 
  size_t i;
  for( i = 0; i < n; i++ )
  {
    h_a[i] = (double)i;
    h_b[i] = (double)i;
  }

  cl_int err;

  CHECK_CL_ERROR(poclu_get_any_device(&ctx, &did, &queue));
  TEST_ASSERT( ctx );
  TEST_ASSERT( did );
  TEST_ASSERT( queue );

  d_a = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_a, &err);
  CHECK_OPENCL_ERROR_IN("clCreateBuffer");
  TEST_ASSERT(d_a);

  d_b = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_b, &err);
  CHECK_OPENCL_ERROR_IN("clCreateBuffer");
  TEST_ASSERT(d_b);

  d_c = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, bytes, NULL, &err);
  CHECK_OPENCL_ERROR_IN("clCreateBuffer");
  TEST_ASSERT(d_c);

  args.size = n;
  args.a = 0;
  args.b = 0;
  args.c = 0;

  mem_list[0] = d_a;
  mem_list[1] = d_b;
  mem_list[2] = d_c;

  args_mem_loc[0] = &args.a;
  args_mem_loc[1] = &args.b;
  args_mem_loc[2] = &args.c;
  
  err = clEnqueueNativeKernel ( queue, native_vec_add, &args, sizeof(struct native_kernel_args),
          3, mem_list, args_mem_loc, 0, NULL, NULL);
  CHECK_OPENCL_ERROR_IN("clEnqueueNativeKernel");
 
  err = clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL );
  CHECK_OPENCL_ERROR_IN("clEnqueueReadBuffer");

  err = clFinish(queue);
  CHECK_OPENCL_ERROR_IN("clFinish");

  for(i = 0; i < n; i++)
    if(h_c[i] != 2 * i)
      {
        printf("Fail to validate vector\n");
        goto error;
      }

  CHECK_CL_ERROR (clReleaseMemObject (d_a));
  CHECK_CL_ERROR (clReleaseMemObject (d_b));
  CHECK_CL_ERROR (clReleaseMemObject (d_c));
  CHECK_CL_ERROR (clReleaseCommandQueue (queue));
  CHECK_CL_ERROR (clReleaseContext (ctx));

  free(h_a);
  free(h_b);
  free(h_c);

  return EXIT_SUCCESS;

error:
  return EXIT_FAILURE;
}