Example #1
0
/*! \brief Creates cl_command_queue for this Queue.
  *
  * This is only needed when the Queue is not instantiated with a Device and a Queue.
  *
  * \param ctxt Context for which this Queue will run. If not specified the set context will be taken.
  */
void ocl::Queue::create(ocl::Context * ctxt)
{
	if(ctxt == 0){
		if(_context == nullptr) throw std::runtime_error("this queue must have a valid context");
	}
	else {
		if(_context != ctxt && _context != nullptr) throw std::runtime_error("cannot have different contexts for the same program");
		_context = ctxt;
	}

	cl_int status;
	
#if CL_VERSION_2_0
  if ( supportsAtLeast2Point0( device().platform() ) )
  {
    cl_queue_properties propties[] = {
      CL_QUEUE_PROPERTIES, this->properties(),
      0
    };
    
    _id = clCreateCommandQueueWithProperties( this->context().id(), this->device().id(), propties, &status );
  }
  else
#endif
  {
    _id = clCreateCommandQueue(this->context().id(), this->device().id(), this->properties(), &status);
  }
  
	OPENCL_SAFE_CALL(status);
	if(_id == nullptr) throw std::runtime_error("could not create command queue");
    _context->insert(this);
}
Example #2
0
EasyOpenCL<T>::EasyOpenCL(bool printData) {

  info = printData;
  cl_uint numPlatforms;           //the NO. of platforms

  // Fetch the different platforms on which we can run our kernel
  cl_platform_id platform = NULL;
  status = clGetPlatformIDs(0, NULL, &numPlatforms);
  checkError("clGetPlatformIDs");

  // Take the first platform available
  if (numPlatforms > 0)
  {
    cl_platform_id* platforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id));
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    platform = platforms[0];
    free(platforms);
  }

  // Get the devices which are available on said platform
  cl_uint numDevices = 0;
  status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);

  if (numDevices)
  {
    //Use the first GPU available
    devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
  }
  else
  {
    // If there is no GPU support, fall back to the CPU

    if(info) {
      std::cout << "No supported GPU device available." << std::endl;
      std::cout << "Falling back to using the CPU." << std::endl;
      std::cout << std::endl;
    }

    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices);
    devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices, devices, NULL);
  }

  //Print the data of the selected device
  if (info) {
    printDeviceProperty(*devices);
  }

  //Create an OpenCL context and a command queue
  context = clCreateContext(NULL, 1, devices, NULL, NULL, &status);
  checkError("clCreateContext");

  commandQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status);
  checkError("clCreateCommandQueueWithProperties");
}
EXTERN_C_ENTER

JNIEXPORT jlong JNICALL Java_org_lwjgl_opencl_CL20_nclCreateCommandQueueWithProperties(JNIEnv *__env, jclass clazz, jlong contextAddress, jlong deviceAddress, jlong propertiesAddress, jlong errcode_retAddress, jlong __functionAddress) {
	cl_context context = (cl_context)(intptr_t)contextAddress;
	cl_device_id device = (cl_device_id)(intptr_t)deviceAddress;
	const cl_command_queue_properties *properties = (const cl_command_queue_properties *)(intptr_t)propertiesAddress;
	cl_int *errcode_ret = (cl_int *)(intptr_t)errcode_retAddress;
	clCreateCommandQueueWithPropertiesPROC clCreateCommandQueueWithProperties = (clCreateCommandQueueWithPropertiesPROC)(intptr_t)__functionAddress;
	UNUSED_PARAMS(__env, clazz)
	return (jlong)(intptr_t)clCreateCommandQueueWithProperties(context, device, properties, errcode_ret);
}
Example #4
0
static
cl_command_queue
skc_runtime_cl_12_create_cq(struct skc_runtime * const runtime,
                            struct skc_cq_pool * const pool)

{
  cl_command_queue cq;

#if 1
      //
      // <= OpenCL 1.2
      //
      cl_int cl_err;

      cq = clCreateCommandQueue(runtime->cl.context,
                                runtime->cl.device_id,
                                pool->cq_props,
                                &cl_err); cl_ok(cl_err);
#else
  if (runtime_cl->version.major < 2)
    {
      //
      // <= OpenCL 1.2
      //
      cl_int cl_err;

      cq = clCreateCommandQueue(runtime_cl->context,
                                runtime_cl->device_id,
                                (cl_command_queue_properties)type,
                                &cl_err); cl_ok(cl_err);
    }
  else
    {
      //
      // >= OpenCL 2.0
      //
      cl_int                    cl_err;
      cl_queue_properties const queue_properties[] = {
        CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0
      };

      cq = clCreateCommandQueueWithProperties(runtime_cl->context,
                                              runtime_cl->device_id,
                                              queue_properties,
                                              &cl_err); cl_ok(cl_err);
    }
#endif

  return cq;
}
void LSHReservoirSampler::clCommandQueue() {
	// Create command queue.Properties(2): CL_QUEUE_PROFILING_ENABLE, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE. 
#ifdef OPENCL_2XX
	command_queue_gpu = clCreateCommandQueueWithProperties(context_gpu, devices_gpu[CL_DEVICE_ID], NULL, &_err);
	clCheckError(_err, "[OpenCL] Couldn't create command queue for GPU.");
	//command_queue_cpu = clCreateCommandQueueWithProperties(context_cpu, devices_cpu[CL_CPU_DEVICE], NULL, &_err);
	//clCheckError(_err, "[OpenCL] Couldn't create command queue for CPU.");
#else
	command_queue_gpu = clCreateCommandQueue(context_gpu, devices_gpu[CL_DEVICE_ID], NULL, &_err);
	clCheckError(_err, "[OpenCL] Couldn't create command queue for GPU.");
	//command_queue_cpu = clCreateCommandQueue(context_cpu, devices_cpu[CL_CPU_DEVICE], NULL, &_err);
	//clCheckError(_err, "[OpenCL] Couldn't create command queue for CPU.");
#endif
}
Example #6
0
OpenCLFramework<T>::OpenCLFramework(bool printData) {

	info = printData;
	cl_uint numPlatforms;	//the NO. of platforms
	cl_platform_id platform = NULL;	//the chosen platform
	status = clGetPlatformIDs(0, NULL, &numPlatforms);
	checkError("clGetPlatformIDs");

	//Just take the first platform available
	if (numPlatforms > 0)
	{
		cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms* sizeof(cl_platform_id));
		status = clGetPlatformIDs(numPlatforms, platforms, NULL);
		platform = platforms[0];
		free(platforms);
	}

	//Try to get the GPU, if not available, take the CPU
	cl_uint	numDevices = 0;
	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
	checkError("clGetDeviceIDs");

	if (numDevices == 0)	//no GPU available.
	{
		std::cout << "No GPU device available." << std::endl;
		std::cout << "Choose CPU as default device." << std::endl;
		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices);
		devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices, devices, NULL);
	}
	else
	{
		//Pick the GPU
		devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
	}

	//Print the data about the picked device
	if (info) {
		printDeviceProperty(*devices);
	}

	//Create an OpenCL context and a command queue
	context = clCreateContext(NULL, 1, devices, NULL, NULL, &status);
	checkError("clCreateContext");

	commandQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status);
	checkError("clCreateCommandQueueWithProperties");
}
Example #7
0
cl_int
set_kernel(int did,
           cl_prop *prop) {
  cl_int status;

  prop->context = clCreateContext(0, prop->num_devices,
      (const cl_device_id *)prop->devices, NULL, NULL, &status);

  prop->queue = clCreateCommandQueueWithProperties(prop->context,
      prop->devices[did], 0, &status);

  prop->program = clCreateProgramWithSource(prop->context,
      prop->kcode.count, (const char **)prop->kcode.codes, NULL, &status);

  const char *options = "-I./include";
  status = clBuildProgram(prop->program, prop->num_devices,
      (const cl_device_id *)prop->devices, options, NULL, NULL);

  if(status != CL_SUCCESS) {
    printf("%s[Build Error Log]%s\n", ERR_STR, CLR_STR);
  } else {
    printf("%s[Build Log]%s\n", WHT_STR, CLR_STR);
  }
  print_build_log(did, prop);
  if(status != CL_SUCCESS) getchar();

  prop->gabor   =
    clCreateKernel(prop->program, (const char *)"enable_gabor",   NULL);
  prop->pooling =
    clCreateKernel(prop->program, (const char *)"enable_pooling", NULL);
  prop->feature =
    clCreateKernel(prop->program, (const char *)"feature_rfcn",   NULL);
  prop->cls     =
    clCreateKernel(prop->program, (const char *)"class_rfcn",     NULL);

  return status;
}
int main(int argc, char *argv[])
{
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue command_queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem buffer;
    cl_int error;
    cl_event event;
    cl_ulong startTime, endTime;
    size_t globalSize[1], localSize[1], warpSize;
    FILE* fptr;
    unsigned long long start, end;

    void* hostData = NULL;

    /* Parse options */
    CommandParser(argc, argv);
    HostDataCreation(hostData);

    GetPlatformAndDevice(platform, device);
    fptr = fopen(g_opencl_ctrl.powerFile, "a");

    /* Create context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
    CHECK_CL_ERROR(error);

    /* Create command queue */
#ifdef USE_CL_2_0_API
    {
        cl_queue_properties property[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
        command_queue = clCreateCommandQueueWithProperties(context, device, property, &error);
    }
#else
    {
        command_queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &error);
    }
#endif
    CHECK_CL_ERROR(error);

    /* Create program */
    CreateAndBuildProgram(program, context, device, strdup(g_opencl_ctrl.fileName));

    /* Create kernels */
    kernel = clCreateKernel(program, g_opencl_ctrl.kernelName, &error);
    CHECK_CL_ERROR(error);

    error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &warpSize, NULL);
    CHECK_CL_ERROR(error);
    fprintf(stderr, "Preferred work group size: %lu\n", warpSize);

#if 0
    fprintf(stderr, "\nData before process:\n");
    switch (g_opencl_ctrl.dataType)
    {
        case TYPE_INT:
            {
                int *intptr = (int *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%d ", intptr[i]);
                fprintf(stderr, "\n");
            }
            break;
        case TYPE_FLOAT:
            {
                float *fltptr = (float *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%f ", fltptr[i]);
                fprintf(stderr, "\n");
            }
            break;
       case TYPE_DOUBLE:
            {
                double *dblptr = (double *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%lf ", dblptr[i]);
                fprintf(stderr, "\n");
            }
            break;
    }
#endif

    /* Create buffers */
    buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, g_opencl_ctrl.dataByte, hostData, &error);
    CHECK_CL_ERROR(error);

    /* Execute kernels */
    error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel, 1, sizeof(long), &g_opencl_ctrl.iteration);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel, 2, sizeof(int), &g_opencl_ctrl.interval);
    CHECK_CL_ERROR(error);

    start = PrintTimingInfo(fptr);

    globalSize[0] = g_opencl_ctrl.global_size;
    localSize[0] = g_opencl_ctrl.local_size;
    error = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalSize, localSize, 0, NULL, &event);
    CHECK_CL_ERROR(error);
    error = clFinish(command_queue);
    CHECK_CL_ERROR(error);

    end = PrintTimingInfo(fptr);
    fclose(fptr);

    error = clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, g_opencl_ctrl.dataByte, hostData, 0, NULL, NULL);
    CHECK_CL_ERROR(error);

#if 0
    fprintf(stderr, "\nData after process:\n");
    switch (g_opencl_ctrl.dataType)
    {
        case TYPE_INT:
            {
                int *intptr = (int *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%d ", intptr[i]);
                fprintf(stderr, "\n");
            }
            break;
        case TYPE_FLOAT:
            {
                float *fltptr = (float *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%f ", fltptr[i]);
                fprintf(stderr, "\n");
            }
            break;
       case TYPE_DOUBLE:
            {
                double *dblptr = (double *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%lf ", dblptr[i]);
                fprintf(stderr, "\n");
            }
            break;
    }
#endif

    /* Event profiling */
    error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL);
    CHECK_CL_ERROR(error);
    error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(endTime), &endTime, NULL);
    CHECK_CL_ERROR(error);
    fprintf(stderr, "\n['%s' execution time] %llu ns\n", g_opencl_ctrl.kernelName, (end - start) * 1000);
    fprintf(stdout, "%llu\n", (end - start) * 1000);

    /* Read the output */

    /* Release object */
    clReleaseKernel(kernel);
    clReleaseMemObject(buffer);
    clReleaseEvent(event);
    clReleaseProgram(program);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
    free(hostData);

    return 0;
}
Example #9
0
/**
 * initialize OpenCL device
 */
int cl_init(int num_values, mvalue_ptr *values, int num_members, member *members, int metric_type)
{
    int i, j;
    #ifdef _VERBOSE
    char string_one[128];
    char string_two[128];
    char string[256];
    #endif // _VERBOSE
    int platform_index = 0;
    int device_index = 0;

    const char *source = NULL;

    population = num_members;
    segments = num_values;
    act_metric = metric_type;

    cl_int err;

    cl_uint platformCount;
    cl_uint deviceCount;
    cl_context_properties properties[3];

    // Probe platforms
    clGetPlatformIDs(0, NULL, &platformCount);
    platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * platformCount);
    clGetPlatformIDs(platformCount, platforms, NULL);

    #ifdef _VERBOSE
    for (i = 0; i < platformCount; i++)
    {
        printf("platform %d\n", i);

        // get all devices
        clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
        devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
        clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);

        for (j = 0; j < deviceCount; j++)
        {
            clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 128, string_one, NULL);
            clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 128, string_two, NULL);

            sprintf(string, "%s (version %s)", string_one, string_two);

            printf("  device %d: %s\n", j, string);
        }

        free(devices);
    }
    #endif // _VERBOSE

    if (platformCount == 0)
    {
        fprintf(stderr, "OpenCL platform not found\n");
        return OPENCL_ERROR;
    }

    // ASK user
    do
    {
        #ifdef _VERBOSE
        puts("platform number: ");
        fgets((char *) string, 7, stdin);
        i = strtol(string, NULL, 10);
        #else
        i = 0;
        #endif
    }
    while (i >= platformCount);

    platform_index = i;

    // get all devices
    clGetDeviceIDs(platforms[platform_index], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
    devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
    clGetDeviceIDs(platforms[platform_index], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);

    do
    {
        #ifdef _VERBOSE
        puts("device number: ");
        fgets((char *) string, 7, stdin);
        j = strtol(string, NULL, 10);
        #else
        j = 0;
        #endif
    }
    while (j >= deviceCount);

    device_index = j;

    // load values to dynamic memory
    for (i = 0; i < segments; i++)
        max_seg_vals = max_seg_vals > values[i].cvals ? max_seg_vals : values[i].cvals;

    mvalue *seg_vals = (mvalue *) malloc(sizeof(mvalue) * max_seg_vals * segments);
    memset(seg_vals, 0, sizeof(mvalue) * max_seg_vals * segments); // initialize

    for (i = 0; i < segments; i++)
        memcpy(seg_vals + i * max_seg_vals, values[i].vals, sizeof(mvalue) * values[i].cvals);

    // create lenghts array
    int *lenghts = (int *) malloc(sizeof(int) * segments);

    for (i = 0; i < segments; i++)
        lenghts[i] = values[i].cvals;

    // read kernels
    source = read_source_file("fitness.cl");

    // context properties list - must be terminated with 0
    properties[0]= CL_CONTEXT_PLATFORM; // specifies the platform to use
    properties[1]= (cl_context_properties) platforms[platform_index];
    properties[2]= 0;

    // create context
    context = clCreateContext(properties,deviceCount,devices,NULL,NULL,&err);
    if (err != CL_SUCCESS)
    {
        printf("chyba ve vytváření kontextu %d\n", err);
    }

    // create command queue
    command_queue = clCreateCommandQueueWithProperties(context, devices[device_index], 0, &err);
    if (err != CL_SUCCESS)
    {
        printf("chyba ve vytváření fronty úloh %d\n", err);
    }

    program = clCreateProgramWithSource(context, 1, &source, 0, &err);

    err = clBuildProgram(program, 1, devices + device_index, "-I.", NULL, NULL);

    if (err != CL_SUCCESS)
    {
        // Determine the size of the log
        size_t log_size;
        clGetProgramBuildInfo(program, devices[device_index], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

        // Allocate memory for the log
        char *log = (char *) malloc(log_size);

        // Get the log
        clGetProgramBuildInfo(program, devices[device_index], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

        // Print the log
        printf("%s\n", log);
        free(log);
        clReleaseCommandQueue(command_queue);
        clReleaseContext(context);
        free(devices);
        free(platforms);
        return 1;
    }

    // specify which kernel from the program to execute
    kernel_population = clCreateKernel(program, "kernel_population", &err);
    kernel_equation = clCreateKernel(program, "solve_equation", &err);
    kernel_avg = clCreateKernel(program, "solve_avg", &err);

    free((void *) source);

    buf_seg_vals = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(mvalue) * max_seg_vals * segments, seg_vals, NULL);
    buf_lenghts = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * segments, lenghts, NULL);
    buf_members = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_float16) * population, members, NULL);
    buf_members_new = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_float16) * population, members, NULL);

    buf_seg_vals_res = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * max_seg_vals * segments * population, NULL, NULL);

    free(seg_vals);
    free(lenghts);

    // set the argument list for the kernel command
    clSetKernelArg(kernel_population, 0, sizeof(cl_mem), &buf_members);
    clSetKernelArg(kernel_population, 1, sizeof(cl_mem), &buf_members_new);

    clSetKernelArg(kernel_equation, 0, sizeof(int), &segments);
    clSetKernelArg(kernel_equation, 1, sizeof(cl_mem), &buf_seg_vals);
    clSetKernelArg(kernel_equation, 2, sizeof(cl_mem), &buf_lenghts);
    clSetKernelArg(kernel_equation, 3, sizeof(int), &population);
    clSetKernelArg(kernel_equation, 4, sizeof(cl_mem), &buf_members_new);
    clSetKernelArg(kernel_equation, 5, sizeof(cl_mem), &buf_seg_vals_res);
    clSetKernelArg(kernel_equation, 6, sizeof(char), &act_metric);

    clSetKernelArg(kernel_avg, 0, sizeof(int), &max_seg_vals);
    clSetKernelArg(kernel_avg, 1, sizeof(int), &segments);
    clSetKernelArg(kernel_avg, 2, sizeof(cl_mem), &buf_seg_vals_res);
    clSetKernelArg(kernel_avg, 3, sizeof(cl_mem), &buf_lenghts);
    clSetKernelArg(kernel_avg, 4, sizeof(cl_mem), &buf_members);
    clSetKernelArg(kernel_avg, 5, sizeof(cl_mem), &buf_members_new);
    clSetKernelArg(kernel_avg, 6, sizeof(char), &act_metric);

    three_dim[0] = max_seg_vals;
    three_dim[1] = segments;
    three_dim[2] = population;

    one_dim[0] = population;

    return 0;
}
Example #10
0
ErrorStatus gemm_clblas(cl_device_id device, const void *inMatrixA, int nrowA, int ncolA, bool transposeA,
                        const void *inMatrixB, int nrowB, int ncolB, bool transposeB,
                        double alpha, double beta, void *outMatrix, bool use_float)
{
    std::stringstream result;
    
    float *input_matrixA_f = (float *)inMatrixA;
    float *input_matrixB_f = (float *)inMatrixB;
    
    float *output_matrix_f = (float *)outMatrix;
    
    double *input_matrixA_d = (double *)inMatrixA;
    double *input_matrixB_d = (double *)inMatrixB;
    
    double *output_matrix_d = (double *)outMatrix;
    
    if (debug) {
        result << "gemm_clblas( " << (use_float ? "FLOAT" : "DOUBLE") <<
        ")" << std::endl << std::endl;
    }
    
    cl_int err = CL_SUCCESS;
    
    clblasStatus status = clblasSetup();
    if (status != CL_SUCCESS) {
        if (debug) {
            result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
        }
        
        err = CL_INVALID_OPERATION;
    }
    
    // get first platform
    cl_platform_id platform = NULL;
    if (err == CL_SUCCESS) {
        err = clGetPlatformIDs(1, &platform, NULL);
    }
    
    if (debug && err == CL_SUCCESS) {
        result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
        result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
    }
    
    // context
    cl_context context = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateContext:" << std::endl;
        }
        
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    }
    
    // queue
    cl_command_queue queue = NULL;
    if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
        if (debug) {
            result << "clCreateCommandQueueWithProperties:" << std::endl;
        }
        
        queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
        
#else
        if (debug) {
            result << "clCreateCommandQueue:" << std::endl;
        }
        
        queue = clCreateCommandQueue(context, device, 0, &err);
#endif
    }
    
    // buffers
    cl_mem cl_input_matrixA = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixA:" << std::endl;
        }
        
        if (use_float) {
            cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowA * ncolA * sizeof(float), input_matrixA_f, &err);
            
        } else {
            cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowA * ncolA * sizeof(double), input_matrixA_d, &err);
        }
    }
    
    cl_mem cl_input_matrixB = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixB:" << std::endl;
        }
        
        if (use_float) {
            cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowB * ncolB * sizeof(float), input_matrixB_f, &err);
            
        } else {
            cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowB * ncolB * sizeof(double), input_matrixB_d, &err);
        }
    }
    
    int nrowC = transposeA ? ncolA : nrowA;
    int ncolC = transposeB ? nrowB : ncolB;
    cl_mem cl_output_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_output_vector:" << std::endl;
        }
        
        if (use_float) {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              nrowC * ncolC * sizeof(float), output_matrix_f, &err);
            
        } else {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              nrowC * ncolC * sizeof(double), output_matrix_d, &err);
        }
        
    }
    
    // ++++++++++++
    const int lda = nrowA;  // first dimension of A (rows), before any transpose
    const int ldb = nrowB;  // first dimension of B (rows), before any transpose
    const int ldc = nrowC;      // first dimension of C (rows)
    
    const int M = transposeA ? ncolA : nrowA;    // rows in A (after transpose, if any) and C
    const int N = transposeB ? nrowB : ncolB;    // cols in B (after transpose, if any) and C
    const int K = transposeA ? nrowA : ncolA;    // cols in A and rows in B (after transposes, if any)
    
    const clblasOrder order = clblasColumnMajor;
    const clblasTranspose transA = transposeA ? clblasTrans : clblasNoTrans;
    const clblasTranspose transB = transposeB ? clblasTrans : clblasNoTrans;
    
    cl_event event = NULL;
    
    if (err == CL_SUCCESS) {
        if (use_float) {
            if (debug) {
                result << "clblasSgemm:" << std::endl;
            }
            
            status = clblasSgemm(order, transA, transB, M, N, K,
                              alpha, cl_input_matrixA, 0, lda,
                              cl_input_matrixB, 0, ldb, beta,
                              cl_output_matrix, 0, ldc,
                              1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS && debug) {
                result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl;
            }
            
        } else {
            if (debug) {
                result << "clblasDgemm:" << std::endl;
            }
            
            status = clblasDgemm(order, transA, transB, M, N, K,
                                 alpha, cl_input_matrixA, 0, lda,
                                 cl_input_matrixB, 0, ldb, beta,
                                 cl_output_matrix, 0, ldc,
                                 1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS) {
                if (debug) {
                    result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl;
                }
                
                err = status;
            }
        }
    }
    
    if (err == CL_SUCCESS) {
        /* Wait for calculations to be finished. */
        if (debug) {
            result << "clWaitForEvents:" << std::endl;
        }
        err = clWaitForEvents(1, &event);
    }
    
    // retrieve result
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "Retrieve result:" << std::endl;
        }
        
        if (use_float) {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(float), output_matrix_f, 0, NULL, NULL);
            
        } else {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(double), output_matrix_d, 0, NULL, NULL);
        }
    }
    
    std::string err_str = clErrorToString(err);
    result << std::endl << err_str << std::endl;
    
    // cleanup
    clReleaseMemObject(cl_output_matrix);
    cl_output_matrix = NULL;
    
    clReleaseMemObject(cl_input_matrixA);
    cl_input_matrixA = NULL;
    
    clReleaseMemObject(cl_input_matrixB);
    cl_input_matrixB = NULL;
    
    clReleaseCommandQueue(queue);
    queue = NULL;
    
    clReleaseContext(context);
    context = NULL;
    
    if (debug) {
        CERR << result.str();
    }
    
    ErrorStatus errorStatus = { err, status };
    
    return errorStatus;
}
Example #11
0
enum piglit_result
piglit_cl_test(const int argc,
               const char** argv,
               const struct piglit_cl_api_test_config* config,
               const struct piglit_cl_api_test_env* env)
{
	enum piglit_result result = PIGLIT_PASS;

	int i;
	int mask;
	cl_int errNo;
	cl_context cl_ctx;
	cl_command_queue command_queue;
	cl_uint num_devices;
	cl_device_id* devices;
	cl_command_queue_properties mixed_command_queue_properties[4] =
		{CL_QUEUE_PROPERTIES, 0, 0, 0};

	cl_context_properties context_properties[] = {
		CL_CONTEXT_PLATFORM, (cl_context_properties)env->platform_id,
		0
	};

	int num_command_queue_properties =
		PIGLIT_CL_ENUM_NUM(cl_command_queue_properties, env->version);
	const cl_command_queue_properties* command_queue_properties =
		PIGLIT_CL_ENUM_ARRAY(cl_command_queue_properties);

	/*** Normal usage ***/

	/* create context */
	cl_ctx = clCreateContext(context_properties,
	                         1,
	                         &env->device_id,
	                         NULL,
	                         NULL,
	                         &errNo);
	if(errNo == CL_DEVICE_NOT_FOUND) {
		fprintf(stderr, "No available devices.\n");
		return PIGLIT_SKIP;
	}
	if(!piglit_cl_check_error(errNo, CL_SUCCESS)) {
		fprintf(stderr,
		        "Failed (error code: %s): Create context.\n",
		        piglit_cl_get_error_name(errNo));
		return PIGLIT_FAIL;
	}

	/*
	 * For each command queue properties mix.
	 * There are 2^(num_command_queue_properties)-1 possible options.
	 */
	for(mask = 0; mask < (1 << num_command_queue_properties); mask++) {
		mixed_command_queue_properties[1] =
			get_mixed_command_queue_properties(mask, command_queue_properties);
		if (properties_forbidden(mixed_command_queue_properties[1], env))
			continue;
#if defined CL_VERSION_2_0
		if (env->version >= 20) {
			command_queue = clCreateCommandQueueWithProperties(
			                             cl_ctx,
		                                     env->device_id,
		                                     mixed_command_queue_properties,
		                                     &errNo);
		} else
#endif //CL_VERSION_2_0
		{
			command_queue = clCreateCommandQueue(cl_ctx,
		                                     env->device_id,
		                                     mixed_command_queue_properties[1],
		                                     &errNo);
		}
		if(errNo != CL_SUCCESS && errNo != CL_INVALID_QUEUE_PROPERTIES) {
			piglit_cl_check_error(errNo, CL_SUCCESS);
			fprintf(stderr,
			        "Failed (error code: %s): Create command queue using 0x%X as command queue properties.\n",
			        piglit_cl_get_error_name(errNo),
			        (unsigned int)mixed_command_queue_properties[1]);
			piglit_merge_result(&result, PIGLIT_FAIL);
		}
		clReleaseCommandQueue(command_queue);
	}
	
	/*** Errors ***/
	
	/*
	 * CL_INVALID_CONTEXT if context is not a valid context.
	 */
	clCreateCommandQueue(NULL, env->device_id, 0, &errNo);
	if(!piglit_cl_check_error(errNo, CL_INVALID_CONTEXT)) {
		fprintf(stderr,
		        "Failed (error code: %s): Trigger CL_INVALID_CONTEXT if contest is not a valid context.\n",
		        piglit_cl_get_error_name(errNo));
		piglit_merge_result(&result, PIGLIT_FAIL);
	}

	/*
	 * CL_INVALID_DEVICE if device is not a valid device or is
	 * not associated with context.
	 */
	clCreateCommandQueue(cl_ctx, NULL, 0, &errNo);
	if(!piglit_cl_check_error(errNo, CL_INVALID_DEVICE)) {
		fprintf(stderr,
		        "Failed (error code: %s): Trigger CL_INVALID_DEVICE if device is not a valid device.\n",
		        piglit_cl_get_error_name(errNo));
		piglit_merge_result(&result, PIGLIT_FAIL);
	}

	num_devices = piglit_cl_get_device_ids(env->platform_id,
	                                       CL_DEVICE_TYPE_ALL,
	                                       &devices);
	for(i = 0; i < num_devices; i++) {
		if(devices[i] != env->device_id) {
			clCreateCommandQueue(cl_ctx, devices[i], 0, &errNo);
			if(!piglit_cl_check_error(errNo, CL_INVALID_DEVICE)) {
				fprintf(stderr,
				        "Failed (error code: %s): Trigger CL_INVALID_DEVICE if device that is not associated with context.\n",
				        piglit_cl_get_error_name(errNo));
				piglit_merge_result(&result, PIGLIT_FAIL);
			}
		}
	}
	free(devices);

	/*
	 * CL_INVALID_VALUE if values specified in properties are not valid.
	 */
	clCreateCommandQueue(cl_ctx, env->device_id, 0XFFFFFFFF, &errNo);
	if(!piglit_cl_check_error(errNo, CL_INVALID_VALUE)) {
		fprintf(stderr,
		        "Failed (error code: %s): Trigger CL_INVALID_VALUE if values specified in properties are not valid.\n",
		        piglit_cl_get_error_name(errNo));
		piglit_merge_result(&result, PIGLIT_FAIL);
	}

	/*
	 * CL_INVALID_QUEUE_PROPERTIES if values specified in properties
	 * are valid but are not supported by the device.
	 *
	 * Note: already tested in 'normal usage' section
	 */

	clReleaseContext(cl_ctx);

	return result;
}
Example #12
0
/*
 * This function picks/creates necessary OpenCL objects which are needed.
 * The objects are:
 * OpenCL platform, device, context, and command queue.
 *
 * All these steps are needed to be performed once in a regular OpenCL application.
 * This happens before actual compute kernels calls are performed.
 *
 * For convenience, in this application you store all those basic OpenCL objects in structure ocl_args_d_t,
 * so this function populates fields of this structure, which is passed as parameter ocl.
 * Please, consider reviewing the fields before going further.
 * The structure definition is right in the beginning of this file.
 */
int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType)
{
    // The following variable stores return codes for all OpenCL calls.
    cl_int err = CL_SUCCESS;

    // Query for all available OpenCL platforms on the system
    // Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string
    cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType);
    if (NULL == platformId)
    {
        LogError("Error: Failed to find OpenCL platform.\n");
        return CL_INVALID_VALUE;
    }

    // Create context with device of specified type.
    // Required device type is passed as function argument deviceType.
    // So you may use this function to create context for any CPU or GPU OpenCL device.
    // The creation is synchronized (pfn_notify is NULL) and NULL user_data
    cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0 };
    ocl->context = clCreateContextFromType(contextProperties, deviceType, NULL, NULL, &err);
    if ((CL_SUCCESS != err) || (NULL == ocl->context))
    {
        LogError("Couldn't create a context, clCreateContextFromType() returned '%s'.\n", TranslateOpenCLError(err));
        return err;
    }

    // Query for OpenCL device which was used for context creation
    err = clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &ocl->device, NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetContextInfo() to get list of devices returned %s.\n", TranslateOpenCLError(err));
        return err;
    }

    // Read the OpenCL platform's version and the device OpenCL and OpenCL C versions
    GetPlatformAndDeviceVersion(platformId, ocl);

    // Create command queue.
    // OpenCL kernels are enqueued for execution to a particular device through special objects called command queues.
    // Command queue guarantees some ordering between calls and other OpenCL commands.
    // Here you create a simple in-order OpenCL command queue that doesn't allow execution of two kernels in parallel on a target device.
#ifdef CL_VERSION_2_0
    if (OPENCL_VERSION_2_0 == ocl->deviceVersion)
    {
        const cl_command_queue_properties properties[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 };
        ocl->commandQueue = clCreateCommandQueueWithProperties(ocl->context, ocl->device, properties, &err);
    }
    else {
        // default behavior: OpenCL 1.2
        cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
        ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err);
    }
#else
    // default behavior: OpenCL 1.2
    cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
    ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err);
#endif
    if (CL_SUCCESS != err)
    {
        LogError("Error: clCreateCommandQueue() returned %s.\n", TranslateOpenCLError(err));
        return err;
    }

    return CL_SUCCESS;
}
Example #13
0
int main(int argc, char *argv[])
{
    int i;
    
    int n = 5;
	int outSize = 7;
    /* A, B, C, D, E */
    float p0[n], p1[n];
    int np[n];
    float out[outSize];

    if (argc != 21) {
        fprintf(stderr, "Usage: %s M0 H0 T0 TAU A0 A1 NA B0 B1 NB "
            "C0 C1 NC D0 D1 ND E0 E1 NE INPUT\n", argv[0]);
        exit(1);
    }


    float m0 = atof(argv[1]);
    float h0 = atof(argv[2]);
    float t0 = atof(argv[3]);
    float tau = strtof(argv[4], NULL);

    /* p0 is where the search starts, p1 is where the search ends and np is the 
     * number of points in between p0 and p1 to do the search */   
    for (i = 0; i < 5; i++) {
        p0[i] = atof(argv[5 + 3*i]);
        p1[i] = atof(argv[5 + 3*i + 1]);
        np[i] = atoi(argv[5 + 3*i + 2]);
    }

    /* Load the traces from the file */

    char *path = argv[20];
    FILE *fp = fopen(path, "r");

    if (!fp) {
        fprintf(stderr, "Failed to open prestack file '%s'!\n", path);
        return 1;
    }

    su_trace_t tr;
    vector_t(su_trace_t) traces;
    vector_init(traces);

    while (su_fgettr(fp, &tr)) {
        vector_push(traces, tr);
    }

    /* Construct the aperture structure from the traces, which is a vector
     * containing pointers to traces */

    aperture_t ap;
    ap.ap_m = 0;
    ap.ap_h = 0;
    ap.ap_t = tau;
    vector_init(ap.traces);
    for (int i = 0; i < traces.len; i++)
        vector_push(ap.traces, &vector_get(traces, i));

    my_aperture_t my_ap = transform(ap);
	//puts("fim transform\n");

    /*-------------------------------------------------------------------------*/

    char *kernelSource = (char *) malloc(MAXSOURCE * sizeof(char));
    
    FILE * file = fopen("kernel.cl", "r");
    if(file == NULL) {
        printf("Error: open the kernel file (kernel.cl)\n");
        exit(1);
    }
    
    // Read kernel code
    size_t source_size = fread(kernelSource, 1, MAXSOURCE, file);
    
    //Device input buffers
    cl_mem d_my_ap;
    cl_mem d_p0, d_p1, d_np, d_aopt, d_bopt, d_copt, d_dopt, d_eopt, d_stack, d_smax;
    //Device output buffer
    cl_mem  d_out;
    
    cl_int err;

    char deviceName[MAX_DEVICE_NAME_SIZE];
    cl_platform_id cpPlatform;
    cl_device_id device_id;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    cl_platform_id *platforms;
    cl_uint platformCount;
    
    //Tamanho em bytes de cada vetor
    size_t bytes_my_ap = sizeof(my_aperture_t);
    size_t bytes_p0 = sizeof(float) * n;
    size_t bytes_p1 = sizeof(float) * n;
    size_t bytes_np = sizeof(int) * n;
    size_t bytes_opt = sizeof(float) * np[0];
	size_t bytes_out = sizeof(float) * outSize;
    

	//Numero de workitems em cada local work group (local size)
//    size_t localSize[3] = {LOCALSIZE, LOCALSIZE, LOCALSIZE};
//
//    size_t globalSize[3] = {
//        ceil((float)np[0] / (float)localSize[0]),
//        ceil((float)np[1] / (float)localSize[1]),
//        ceil((float)np[2] / (float)localSize[2])
//    };

    size_t localSize[3] = {2,2,2};

    size_t globalSize[3] = {20,20,20};
    

    // Bind to platforms
	clGetPlatformIDs(0, NULL, &platformCount);
	if (platformCount == 0) {
		printf("Error, cound not find any OpenCL platforms on the system.\n");
		exit (2);
	}
	
	platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
	clGetPlatformIDs(platformCount,platforms, NULL);
	
	// Find first device that works
	err = 1;
	for (i = 0; i < platformCount && err !=CL_SUCCESS; i++) {
		// Get ID for the device (CL_DEVICE_TYPE_ALL, CL_DEVICE_TYPE_GPU, ...)
		err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);

	}
	
	checkError(err, "get device");

	if (err !=CL_SUCCESS) {
		printf("Error, could not find a valid device.");
		exit (3);
	}
	
	err = clGetDeviceInfo(device_id, CL_DEVICE_NAME,MAX_DEVICE_NAME_SIZE, deviceName, NULL);
	printf("Device: %s \n",deviceName);
	
	if (err !=CL_SUCCESS) {
		printf("Error, could not read the info for device.");
		exit (4);
	}
	
	// Create a context
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	
	if (err !=CL_SUCCESS) {
		printf("Error, could not create the context.");
		exit (5);
	}
	
	// Create a command queue
	queue = clCreateCommandQueueWithProperties(context, device_id, 0, &err);
	
	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1,
			(const char **) & kernelSource,(const size_t *) &source_size, &err);
			
	if (err !=CL_SUCCESS) {
		printf("Error, could not create program with source.");
		exit (6);
	}
			
	// Build the program executable " --disable-multilib "
	err = clBuildProgram(program, 0,NULL, NULL, NULL, NULL);
	if (err == CL_BUILD_PROGRAM_FAILURE) {
		cl_int logStatus;
		char* buildLog = NULL;
		size_t buildLogSize = 0;
		logStatus = clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, buildLogSize, NULL, &buildLogSize);
		buildLog = (char*)malloc(buildLogSize);
		memset(buildLog, 0, buildLogSize);
		logStatus = clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL);
		printf("ERROR %d (logsz = %d): [[%s]]\n", err, buildLogSize, buildLog);
		free(buildLog);
		return err;
	} else if (err!=0) {
		printf("Error, could not build program.\n");
		exit (7);
	}
	
	// Create the compute kernel in the program we wish to run
	
	kernel = clCreateKernel(program, "calculate", &err);
	
	if (err !=CL_SUCCESS) {
		printf("Error, could not create the kernel.");
		exit (6);
	}
	

	float smax[np[0]];
	for(int i = 0; i < np[0]; i++){
		smax[i] = -1.0;
	}
	size_t bytes_smax = sizeof(float) * np[0];

	// Create the input and output arrays in device memory for our calculation
	d_my_ap = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes_my_ap, NULL, NULL);
	d_p0 = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes_p0, NULL, NULL);
	d_p1 = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes_p1, NULL, NULL);
	d_np = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes_np, NULL, NULL);
	d_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes_out, NULL, NULL);

	d_aopt = clCreateBuffer(context, CL_MEM_READ_WRITE , bytes_smax, NULL, NULL);
	d_bopt = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes_smax, NULL, NULL);
	d_copt = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes_smax, NULL, NULL);
	d_dopt = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes_smax, NULL, NULL);
	d_eopt = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes_smax, NULL, NULL);
	d_stack = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes_smax, NULL, NULL);
	d_smax = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes_smax, NULL, NULL);

	// Write our data set into the input array in device memory

	err = clEnqueueWriteBuffer(queue, d_my_ap, CL_TRUE, 0, bytes_my_ap, (const void*)&my_ap, 0, NULL, NULL);
	err |= clEnqueueWriteBuffer(queue, d_p0, CL_TRUE, 0, bytes_p0, p0, 0, NULL, NULL);
	err |= clEnqueueWriteBuffer(queue, d_p1, CL_TRUE, 0, bytes_p1, p1, 0, NULL, NULL);
	err |= clEnqueueWriteBuffer(queue, d_np, CL_TRUE, 0, bytes_np, np, 0, NULL, NULL);
	err |= clEnqueueWriteBuffer(queue, d_smax, CL_TRUE, 0, bytes_smax, smax, 0, NULL, NULL);

	// Set the arguments to our compute kernel
	err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_my_ap);
	err |= clSetKernelArg(kernel, 1, sizeof(float), &m0);
	err |= clSetKernelArg(kernel, 2, sizeof(float), &h0);
	err |= clSetKernelArg(kernel, 3, sizeof(float), &t0);
	err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_p0);
	err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_p1);
	err |= clSetKernelArg(kernel, 6, sizeof(cl_mem), &d_np);
	err |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &d_out);
	err |= clSetKernelArg(kernel, 8, np[0] * sizeof(cl_float), &d_aopt);//_Aopt
	err |= clSetKernelArg(kernel, 9, np[0] * sizeof(cl_float), &d_bopt);//_Bopt
	err |= clSetKernelArg(kernel, 10, np[0] * sizeof(cl_float), &d_copt);//_Copt
	err |= clSetKernelArg(kernel, 11, np[0] * sizeof(cl_float), &d_dopt);//_Dopt
	err |= clSetKernelArg(kernel, 12, np[0] * sizeof(cl_float), &d_eopt);//_Eopt
	err |= clSetKernelArg(kernel, 13, np[0] * sizeof(cl_float), &d_stack);//_stack
	err |= clSetKernelArg(kernel, 14, np[0] * sizeof(cl_float), &d_smax);//smax
	

	
	if (err !=CL_SUCCESS) {
		printf("Error, could not set kernel args.");
		exit (7);
	}
	
	err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, (const size_t *)globalSize,  (const size_t *)localSize, 0, NULL, NULL);
	// Execute the kernel over the entire range of the data set
	
	if (err !=CL_SUCCESS) {
		printf("Error, could not enqueue commands. %d\n", err);
		exit (8);
	}
	
	// Wait for the command queue to get serviced before reading back results
	clFinish(queue);
	
	// Read the results from the device
	clEnqueueReadBuffer(queue, d_out, CL_TRUE, 0, bytes_out, out, 0, NULL, NULL );

	/*-------------------------------------------------------------------------*/

    printf("A=%g\n", out[0]);
    printf("B=%g\n", out[1]);
    printf("C=%g\n", out[2]);
    printf("D=%g\n", out[3]);
    printf("E=%g\n", out[4]);
    printf("Stack=%g\n", out[5]);
    printf("Semblance=%g\n", out[6]);
    printf("\n");

    return 0;
}
Example #14
0
int main(int argc, char** argv) {
    // beginning of the verbose OpenCL allocation
    cl_platform_id platform_id = NULL;
    cl_uint ret_num_platforms = 0;
    cl_uint ret_num_devices = 0;
    cl_int ret = 0;

    // the output from opencl kernel
    float *c_inputs = malloc(ARRAY_SIZE*sizeof(float));
    float *c_outputs = malloc(ARRAY_SIZE*sizeof(float));
    cl_float *cl_inputs = malloc(ARRAY_SIZE*sizeof(cl_float));
    cl_float *cl_outputs = malloc(ARRAY_SIZE*sizeof(cl_float));

    // get random numbers via Rmath
    set_seed(atoi(argv[1]), 197414);
    float tmp_in = 0.0;

    #pragma omp parallel for
    for (long i = 0; i < ARRAY_SIZE; i++) {
        tmp_in = rnorm(0, 1);
        c_inputs[i] = tmp_in;
        cl_inputs[i] = (cl_float) tmp_in;
    }

    // measure time elapse
    clock_t start = clock();
    #pragma omp parallel for
    for (long i = 0; i < ARRAY_SIZE; i++) {
        c_outputs[i] = expf(c_inputs[i]);
    }
    printf("CPU time for %d exp operation: %d\n", ARRAY_SIZE, (int) (clock() - start));

    // read kernel source
    FILE *fp;
    char filename[] = "./hello_log.cl";
    char *source_str;
    size_t source_size;
    fp = fopen(filename, "r");
    source_str = (char*) malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str,
                        1,
                        MAX_SOURCE_SIZE,
                        fp);
    fclose(fp);

    // get platform and device info
    ret = clGetPlatformIDs(1,
                           &platform_id,
                           &ret_num_platforms);
    cl_device_id device_ids[2];
    ret = clGetDeviceIDs(platform_id,
                         CL_DEVICE_TYPE_GPU,
                         2,
                         device_ids,
                         &ret_num_devices);
    printf("Number of devices: %5d\n", ret_num_devices);

    // print device name
    char bdname[100];
    clGetDeviceInfo(device_ids[1], CL_DEVICE_NAME, 100, bdname, NULL);
    printf("Used device: %s\n", bdname);

    // use second GPU
    cl_device_id device_id = device_ids[1];

    // create opencl context
    cl_context context = clCreateContext(NULL,
                                         1,
                                         &device_id,
                                         NULL,
                                         NULL,
                                         &ret);


    // create command queue
    cl_command_queue command_queue = clCreateCommandQueueWithProperties(context,
                                     device_id,
                                     0,
                                     &ret);

    // create memory buffer for input
    cl_mem memobj_in = clCreateBuffer(context,
                                      CL_MEM_READ_WRITE,
                                      ARRAY_SIZE*sizeof(cl_float),
                                      NULL,
                                      &ret);

    // create memory buffer for output
    cl_mem memobj_out = clCreateBuffer(context,
                                       CL_MEM_READ_WRITE,
                                       ARRAY_SIZE*sizeof(cl_float),
                                       NULL,
                                       &ret);

    // create kernel program
    cl_program program = clCreateProgramWithSource(context,
                         1,
                         (const char **)&source_str,
                         (const size_t *)&source_size,
                         &ret);

    // build program
    ret = clBuildProgram(program,
                         1,
                         &device_id,
                         NULL,
                         NULL,
                         NULL);
    printf("build program successfully\n");

    // create opencl kernel
    cl_kernel kernel = clCreateKernel(program,
                                      "hello_exp",
                                      &ret);

    // set opencl parameters for inputs
    ret = clSetKernelArg(kernel,
                         0,
                         sizeof(cl_mem),
                         (void *)&memobj_in);

    // set opencl parameters for inputs
    ret = clSetKernelArg(kernel,
                         1,
                         sizeof(cl_mem),
                         (void *)&memobj_out);

    // execute opencl kernel
    size_t global_item_size = ARRAY_SIZE/32;
    size_t local_item_size = 32;

    // measure time
    start = clock();
    ret = clEnqueueWriteBuffer(command_queue,
                               memobj_in,
                               CL_TRUE,
                               0,
                               ARRAY_SIZE*sizeof(cl_float),
                               cl_inputs,
                               0,
                               NULL,
                               NULL);
    // run it
    ret = clEnqueueNDRangeKernel(command_queue,
                                 kernel,
                                 1,
                                 NULL,
                                 &global_item_size,
                                 &local_item_size,
                                 0,
                                 NULL,
                                 NULL);

    // copy results from the memory buffer
    ret = clEnqueueReadBuffer(command_queue,
                              memobj_out,
                              CL_TRUE,
                              0,
                              ARRAY_SIZE*sizeof(cl_float),
                              cl_outputs,
                              0,
                              NULL,
                              NULL);
    printf("GPU time (with PCI-E overhead): %d\n", (int) (clock() - start));
    printf("inputs: %3.7f  %3.7f\n", c_inputs[150000], cl_inputs[150000]);
    printf("outputs: %3.7f  %3.7f\n", c_outputs[150000], (float) cl_outputs[150000]);

    // finalization
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(memobj_in);
    ret = clReleaseMemObject(memobj_out);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free(source_str);

    return 0;
}
Example #15
0
int main(int argc, char **argv) {




	if (find_option(argc, argv, "-h") >= 0)
	{
		printf("Options:\n");
		printf("-h to see this help\n");
		printf("-n <int> to set the number of particles\n");
		printf("-o <filename> to specify the output file name\n");
		printf("-s <filename> to specify the summary output file name\n");
		return 0;
	}


	int n = read_int(argc, argv, "-n", 1000);

	char *savename = read_string(argc, argv, "-o", NULL);
	char *sumname = read_string(argc, argv, "-s", NULL);

	// For return values.
	cl_int ret;

	// OpenCL stuff.
	// Loading kernel files.
	FILE *kernelFile;
	char *kernelSource;
	size_t kernelSize;

	kernelFile = fopen("simulationKernel.cl", "r");

	if (!kernelFile) {
		fprintf(stderr, "No file named simulationKernel.cl was found\n");
		exit(-1);
	}
	kernelSource = (char*)malloc(MAX_SOURCE_SIZE);
	kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile);
	fclose(kernelFile);

	// Getting platform and device information
	cl_platform_id platformId = NULL;
	cl_device_id deviceID = NULL;
	cl_uint retNumDevices;
	cl_uint retNumPlatforms;
	ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms);
	// Different types of devices to pick from. At the moment picks the default opencl device.
	//CL_DEVICE_TYPE_GPU
	//CL_DEVICE_TYPE_ACCELERATOR
	//CL_DEVICE_TYPE_DEFAULT
	//CL_DEVICE_TYPE_CPU
	ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_ACCELERATOR, 1, &deviceID, &retNumDevices);

	// Max workgroup size
	size_t max_available_local_wg_size;
	ret = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_available_local_wg_size, NULL);
	// Creating context.
	cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &ret);


	// Creating command queue
        cl_command_queue commandQueue = clCreateCommandQueueWithProperties (context, deviceID, 0, &ret);
	
	// Build program
	cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&kernelSize, &ret);
//	printf("program = ret %i \n", ret);
	ret = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
//	printf("clBuildProgram: ret %i \n", ret);
	
	// Create kernels
	cl_kernel forceKernel = clCreateKernel(program, "compute_forces_gpu", &ret);

	cl_kernel moveKernel = clCreateKernel(program, "move_gpu", &ret);

	cl_kernel binInitKernel = clCreateKernel(program, "bin_init_gpu", &ret);
	cl_kernel binKernel = clCreateKernel(program, "bin_gpu", &ret);

	FILE *fsave = savename ? fopen(savename, "w") : NULL;
	FILE *fsum = sumname ? fopen(sumname, "a") : NULL;
	particle_t *particles = (particle_t*)malloc(n * sizeof(particle_t));

	// GPU particle data structure
	cl_mem d_particles = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(particle_t), NULL, &ret);

	// Set size
	set_size(n);

	init_particles(n, particles);

	double copy_time = read_timer();

	// Copy particles to device.
	ret = clEnqueueWriteBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, NULL);
	copy_time = read_timer() - copy_time;
	

	// Calculating thread and thread block counts.
	// sizes
	size_t globalItemSize;
	size_t localItemSize;
	// Global item size
	if (n <= NUM_THREADS) {
		globalItemSize = NUM_THREADS;
		localItemSize = 16;
	}
	else if (n % NUM_THREADS != 0) {
		globalItemSize = (n / NUM_THREADS + 1) * NUM_THREADS;
	}
	else {
		globalItemSize = n;
	}

	// Local item size
	localItemSize = globalItemSize / NUM_THREADS;	

	// Bins and bin sizes.
	// Because of uniform distribution we will know that bins size is amortized. Therefore I picked the value of 10.
	// There will never be 10 particles in one bin.
	int maxParticles = 10;
	
	// Calculating the number of bins.
	int numberOfBins = (int)ceil(size/(2*cutoff)) + 2;
	
	// Bins will only exist on the device.
	particle_t* bins;
	
	// How many particles are there in each bin - also only exists on the device.
	volatile int* binSizes;
	
	// Number of bins to be initialized.
	size_t clearAmt = numberOfBins*numberOfBins;
	
	// Allocate memory for bins on the device.
	cl_mem d_binSizes = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * sizeof(volatile int), NULL, &ret);
	cl_mem d_bins = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * maxParticles * sizeof(particle_t), NULL, &ret);
	
	// SETTING ARGUMENTS FOR THE KERNELS
	
	// Set arguments for the init / clear kernel
	ret = clSetKernelArg(binInitKernel, 0, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(binInitKernel, 1, sizeof(int), &numberOfBins);

	// Set arguments for the binning kernel
	ret = clSetKernelArg(binKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(binKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(binKernel, 2, sizeof(cl_mem), (void *)&d_bins);
	ret = clSetKernelArg(binKernel, 3, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(binKernel, 4, sizeof(int), &numberOfBins);

	// Set arguments for force kernel.
	ret = clSetKernelArg(forceKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(forceKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(forceKernel, 2, sizeof(cl_mem), (void *)&d_bins);
	ret = clSetKernelArg(forceKernel, 3, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(forceKernel, 4, sizeof(int), &numberOfBins);


	// Set arguments for move kernel
	ret = clSetKernelArg(moveKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(moveKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(moveKernel, 2, sizeof(double), &size);
	
	
	// Variable to check if kernel execution is done.
	cl_event kernelDone;
	
	
	double simulation_time = read_timer();
	int step = 0;
	for (step = 0; step < NSTEPS; step++) {


		// Execute bin initialization (clearing after first iteration)
		ret = clEnqueueNDRangeKernel(commandQueue, binInitKernel, 1, NULL, &clearAmt, NULL, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);
		// Execute binning kernel
		ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
//		ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);	
		// Execute force kernel
		ret = clEnqueueNDRangeKernel(commandQueue, forceKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);
		// Execute move kernel
		ret = clEnqueueNDRangeKernel(commandQueue, moveKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);

		if (fsave && (step%SAVEFREQ) == 0) {
			// Copy the particles back to the CPU
			ret = clEnqueueReadBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, &kernelDone);
			ret = clWaitForEvents(1, &kernelDone);

			save(fsave, n, particles);
		}

	}
	simulation_time = read_timer() - simulation_time;
	printf("CPU-GPU copy time = %g seconds\n", copy_time);
	printf("n = %d, simulation time = %g seconds\n", n, simulation_time);

	if (fsum)
		fprintf(fsum, "%d %lf \n", n, simulation_time);

	if (fsum)
		fclose(fsum);
	free(particles);
	if (fsave)
		fclose(fsave);


	ret = clFlush(commandQueue);
	ret = clFinish(commandQueue);
	ret = clReleaseCommandQueue(commandQueue);
	ret = clReleaseKernel(forceKernel);
	ret = clReleaseKernel(moveKernel);
	ret = clReleaseProgram(program);
	ret = clReleaseMemObject(d_particles);
	ret = clReleaseContext(context);


	return 0;
}
Example #16
0
ErrorStatus crossprod_clblas(cl_device_id device, void *inMatrix, void *outMatrix, int nrow, int ncol, bool use_float)
{
    std::stringstream result;
    
    float *input_matrix_f = (float *)inMatrix;
    
    float *output_matrix_f = (float *)outMatrix;
    
    double *input_matrix_d = (double *)inMatrix;
    
    double *output_matrix_d = (double *)outMatrix;
    
    if (debug) {
        result << "crossprod_clblas( " << (use_float ? "FLOAT" : "DOUBLE") <<
        ", nrow = " << nrow << ", ncol = " << ncol << ")" << std::endl << std::endl;
    }
    
    cl_int err = CL_SUCCESS;

    clblasStatus status = clblasSetup();
    if (status != CL_SUCCESS) {
        if (debug) {
            result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
        }
        
        err = CL_INVALID_OPERATION;
    }

    // get first platform
    cl_platform_id platform = NULL;
    if (err == CL_SUCCESS) {
        err = clGetPlatformIDs(1, &platform, NULL);
    }
    
    if (debug && err == CL_SUCCESS) {
        result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
        result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
    }
    
    // context
    cl_context context = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateContext:" << std::endl;
        }
        
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    }
    
    // queue
    cl_command_queue queue = NULL;
    if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
        if (debug) {
            result << "clCreateCommandQueueWithProperties:" << std::endl;
        }
        
        queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
        
#else
        if (debug) {
            result << "clCreateCommandQueue:" << std::endl;
        }
        
        queue = clCreateCommandQueue(context, device, 0, &err);
#endif
    }
    
    // buffers
    cl_mem cl_input_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrix:" << std::endl;
        }
        
        if (use_float) {
            cl_input_matrix = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                             nrow * ncol * sizeof(float), input_matrix_f, &err);
            
        } else {
            cl_input_matrix = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                             nrow * ncol * sizeof(double), input_matrix_d, &err);
        }
    }
    
    cl_mem cl_output_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_output_vector:" << std::endl;
        }

        if (use_float) {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              ncol * ncol * sizeof(float), output_matrix_f, &err);
            
        } else {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              ncol * ncol * sizeof(double), output_matrix_d, &err);
        }

    }
    
    // ++++++++++++
    const clblasOrder order = clblasColumnMajor;
    const clblasTranspose transA = clblasTrans;
    
    const size_t lda = nrow;
    const size_t ldc = ncol;
    
    const cl_float alpha = 1.0;
    
    clblasUplo uplo = clblasUpper;
    
    cl_event event = NULL;
    
    if (err == CL_SUCCESS) {
        if (use_float) {
            if (debug) {
                result << "clblasSsyrk:" << std::endl;
            }
            
            status = clblasSsyrk(order, uplo, transA, ncol, nrow, alpha, cl_input_matrix, 0, lda, 0.0,
                                 cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS && debug) {
                result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl;
            }

        } else {
            if (debug) {
                result << "clblasDsyrk:" << std::endl;
            }
            
            status = clblasDsyrk(order, uplo, transA, ncol, nrow, alpha, cl_input_matrix, 0, lda, 0.0,
                                 cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS) {
                if (debug) {
                    result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl;
                }
                
                err = status;
            }
        }
    }
    
    if (err == CL_SUCCESS) {
        /* Wait for calculations to be finished. */
        if (debug) {
            result << "clWaitForEvents:" << std::endl;
        }
        err = clWaitForEvents(1, &event);
    }
    
    // retrieve result
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "Retrieve result:" << std::endl;
        }
        
        if (use_float) {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, ncol * ncol * sizeof(float), output_matrix_f, 0, NULL, NULL);
            symmetrizeSquare_f(output_matrix_f, ncol);
            
        } else {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, ncol * ncol * sizeof(double), output_matrix_d, 0, NULL, NULL);
            symmetrizeSquare_d(output_matrix_d, ncol);
        }
    }
    
    std::string err_str = clErrorToString(err);
    result << std::endl << err_str << std::endl;
    
    // cleanup
    clReleaseMemObject(cl_output_matrix);
    cl_output_matrix = NULL;
    
    clReleaseMemObject(cl_input_matrix);
    cl_input_matrix = NULL;
    
    clReleaseCommandQueue(queue);
    queue = NULL;
    
    clReleaseContext(context);
    context = NULL;
    
    if (debug) {
        CERR << result.str();
    }
    
    ErrorStatus errorStatus = { err, status };
    
//    return status != CL_SUCCESS ? clblasErrorToString(status) : clErrorToString(err);
    return errorStatus;
}
Example #17
0
int
main(int argc, char const * argv[])
{
  char const * const target_platform_substring = "Intel";
  char const * const target_device_substring   = "Graphics";

  //
  // find platform and device ids
  //
  cl_platform_id platform_id;
  cl_device_id   device_id;

#define HS_DEVICE_NAME_SIZE  64

  char   device_name[HS_DEVICE_NAME_SIZE];
  size_t device_name_size;

  cl(FindIdsByName(target_platform_substring,
                   target_device_substring,
                   &platform_id,
                   &device_id,
                   HS_DEVICE_NAME_SIZE,
                   device_name,
                   &device_name_size,
                   true));
  //
  // create context
  //
  cl_context_properties context_properties[] =
    {
      CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id,
      0
    };

  cl_int     cl_err;
  cl_context context = clCreateContext(context_properties,
                                       1,
                                       &device_id,
                                       NULL,
                                       NULL,
                                       &cl_err);
  cl_ok(cl_err);

  //
  // create command queue
  //
#if 0 // OPENCL 2.0

  cl_queue_properties props[] = {
    CL_QUEUE_PROPERTIES,
    (cl_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
#ifndef NDEBUG
    (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
#endif
    0
  };

  cl_queue_properties props_profile[] = {
    CL_QUEUE_PROPERTIES,
    (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
    0
  };

  cl_command_queue cq = clCreateCommandQueueWithProperties(context,
                                                           device_id,
                                                           props,
                                                           &cl_err); cl_ok(cl_err);

  cl_command_queue cq_profile = clCreateCommandQueueWithProperties(context,
                                                                   device_id,
                                                                   props_profile,
                                                                   &cl_err); cl_ok(cl_err);
#else // OPENCL 1.2

  cl_command_queue cq = clCreateCommandQueue(context,
                                             device_id,
#ifndef NDEBUG
                                             CL_QUEUE_PROFILING_ENABLE |
#endif
                                             CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
                                             &cl_err); cl_ok(cl_err);

  cl_command_queue cq_profile = clCreateCommandQueue(context,
                                                     device_id,
                                                     CL_QUEUE_PROFILING_ENABLE,
                                                     &cl_err); cl_ok(cl_err);
#endif

  //
  // Intel GEN workaround -- create dummy kernel for semi-accurate
  // profiling on an out-of-order queue.
  //
  hs_dummy_kernel_create(context,device_id);

  //
  // select the target
  //

  uint32_t const key_val_words = (argc == 1) ? 2 : strtoul(argv[1],NULL,0);

  struct hs_cl_target const * hs_target;

  if (key_val_words == 1)
    hs_target = &hs_intel_gen8_u32;
  else
    hs_target = &hs_intel_gen8_u64;

  //
  // create kernels
  //
  fprintf(stdout,"Creating... ");

  struct hs_cl * const hs = hs_cl_create(hs_target,context,device_id);

  fprintf(stdout,"done.\n");

  //
  //
  //

#ifdef NDEBUG
#define HS_BENCH_LOOPS   100
#define HS_BENCH_WARMUP  100
#else
#define HS_BENCH_LOOPS   1
#define HS_BENCH_WARMUP  0
#endif

  //
  // sort sizes and loops
  //
  uint32_t const kpb        = hs_target->config.slab.height << hs_target->config.slab.width_log2;

  uint32_t const count_lo   = (argc <= 2) ? kpb             : strtoul(argv[2],NULL,0);
  uint32_t const count_hi   = (argc <= 3) ? count_lo        : strtoul(argv[3],NULL,0);
  uint32_t const count_step = (argc <= 4) ? count_lo        : strtoul(argv[4],NULL,0);
  uint32_t const loops      = (argc <= 5) ? HS_BENCH_LOOPS  : strtoul(argv[5],NULL,0);
  uint32_t const warmup     = (argc <= 6) ? HS_BENCH_WARMUP : strtoul(argv[6],NULL,0);
  bool     const linearize  = (argc <= 7) ? true            : strtoul(argv[7],NULL,0);

  //
  // labels
  //
  fprintf(stdout,
          "Device, "
          "Driver, "
          "Type, "
          "Slab/Linear, "
          "Verified?, "
          "Keys, "
          "Keys Padded In, "
          "Keys Padded Out, "
          "CPU Algorithm, "
          "CPU Msecs, "
          "CPU Mkeys/s, "
          "Trials, "
          "Avg. Msecs, "
          "Min Msecs, "
          "Max Msecs, "
          "Avg. Mkeys/s, "
          "Max. Mkeys/s\n");

  //
  // we want to track driver versions
  //
  size_t driver_version_size;

  cl(GetDeviceInfo(device_id,
                   CL_DRIVER_VERSION,
                   0,
                   NULL,
                   &driver_version_size));

  char * const driver_version = ALLOCA_MACRO(driver_version_size);

  cl(GetDeviceInfo(device_id,
                   CL_DRIVER_VERSION,
                   driver_version_size,
                   driver_version,
                   NULL));
  //
  // benchmark
  //
  hs_bench(context,
           cq,cq_profile,
           device_name,
           driver_version,
           hs_target->config.words.key + hs_target->config.words.val,
           1 << hs_target->config.slab.width_log2,
           hs_target->config.slab.height,
           hs,
           count_lo,
           count_hi,
           count_step,
           loops,
           warmup,
           linearize);

  //
  // release everything
  //
  hs_cl_release(hs);

  hs_dummy_kernel_release();

  cl(ReleaseCommandQueue(cq));
  cl(ReleaseCommandQueue(cq_profile));

  cl(ReleaseContext(context));

  return 0;
}
Example #18
0
int main(void) {
  cl_context context = 0;
  cl_command_queue command_waiting_line = 0;
  cl_program program = 0;
  cl_device_id device_id = 0;
  cl_kernel kernel = 0;
  // int numberOfMemoryObjects = 3;
  cl_mem memoryObjects[3] = {0, 0, 0};
  cl_platform_id platform_id = NULL;
  cl_uint ret_num_devices;
  cl_int errorNumber;
  cl_int ret;
  /* Load the source code containing the kernel*/
  char fileName[] = "source/parallel/composition_population.cl";
  FILE *fp;
  char *source_str;
  size_t source_size;
  fp = fopen(fileName, "r");
  cl_uint ret_num_platforms;
  if (!fp) {
    fprintf(stderr, "Failed to load kernel %s:%d.\n", __FILE__, __LINE__);
    exit(1);
  }
  source_str = (char *)malloc(MAX_SOURCE_SIZE);
  source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
  fclose(fp);

  // printf("file: %s :file", source_str);

  getInfo();

  ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to get platform id's. %s:%d\n", __FILE__, __LINE__);
    return 1;
  }
  ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
                       &ret_num_devices);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to get OpenCL devices. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to create an OpenCL context. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

#ifdef CL_VERSION_2_0
  command_waiting_line =
      clCreateCommandQueueWithProperties(context, device_id, 0, &ret);
#else
  command_waiting_line = clCreateCommandQueue(context, device_id, 0, &ret);
#endif

  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to create the OpenCL command queue. %s:%d\n",
            __FILE__, __LINE__);
    return 1;
  }

  /* create program */

  program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
                                      (const size_t *)&source_size, &ret);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to create OpenCL program. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  /* Build Kernel Program */
  ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to build OpenCL program. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  kernel = clCreateKernel(program, "composition_population", &errorNumber);
  if (!success_verification(errorNumber)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to create OpenCL kernel. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  /* [Setup memory] */
  /* Number of elements in the arrays of input and output data. */

  /* The buffers are the size of the arrays. */
  uint16_t activity_atom_size = MAX_INDEPENDENTCLAUSE_TABLET * 1;
  uint8_t program_size = 1;
  uint8_t population_size = 4;
  size_t activity_atom_byte_size = activity_atom_size * sizeof(v16us);
  uint16_t population_byte_size =
      (uint16_t)(program_size * (uint16_t)(population_size * sizeof(v16us)));

  /*
   * Ask the OpenCL implementation to allocate buffers for the data.
   * We ask the OpenCL implemenation to allocate memory rather than allocating
   * it on the CPU to avoid having to copy the data later.
   * The read/write flags relate to accesses to the memory from within the
   * kernel.
   */
  int createMemoryObjectsSuccess = TRUE;

  memoryObjects[0] =
      clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                     activity_atom_byte_size, NULL, &errorNumber);
  createMemoryObjectsSuccess &= success_verification(errorNumber);

  memoryObjects[1] =
      clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                     population_byte_size, NULL, &errorNumber);
  createMemoryObjectsSuccess &= success_verification(errorNumber);

  memoryObjects[2] =
      clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                     newspaper_byte_size, NULL, &errorNumber);
  createMemoryObjectsSuccess &= success_verification(errorNumber);

  if (!createMemoryObjectsSuccess) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to create OpenCL buffer. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  /* [Setup memory] */

  /* [Map the buffers to pointers] */
  /* Map the memory buffers created by the OpenCL implementation to pointers so
   * we can access them on the CPU. */
  int mapMemoryObjectsSuccess = TRUE;

  v16us *activity_atom = (v16us *)clEnqueueMapBuffer(
      command_waiting_line, memoryObjects[0], CL_TRUE, CL_MAP_WRITE, 0,
      activity_atom_byte_size, 0, NULL, NULL, &errorNumber);
  mapMemoryObjectsSuccess &= success_verification(errorNumber);

  // cl_int *inputB = (cl_int *)clEnqueueMapBuffer(
  //    command_waiting_line, memoryObjects[1], CL_TRUE, CL_MAP_WRITE, 0,
  //    bufferSize, 0,
  //    NULL, NULL, &errorNumber);
  // mapMemoryObjectsSuccess &= success_verification(errorNumber);

  if (!mapMemoryObjectsSuccess) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to map buffer. %s:%d\n", __FILE__, __LINE__);
    return 1;
  }
  /* [Map the buffers to pointers] */

  /* [Initialize the input data] */

  const char *activity_atom_text = "nyistu htoftu hnattu hnamtu";
  const uint16_t activity_atom_text_size =
      (uint16_t)(strlen(activity_atom_text));
  const char *quiz_independentClause_list_text =
      "zrundoka hwindocayu hwindokali"
      "hwindoka tyutdocayu tyindokali"
      "tyutdoka tyutdocayu hfutdokali"
      "tyindoka fwandocayu nyatdokali";
  //"bu.hnac.2.hnac.buka bu.hnac.2.hnac.buca yu "
  //"bu.hnac.4.hnac.bukali";
  const uint16_t quiz_independentClause_list_text_size =
      (uint16_t)strlen(quiz_independentClause_list_text);
  uint16_t quiz_independentClause_list_size = 4;
  v16us quiz_independentClause_list[8];
  uint16_t text_remainder = 0;
  // uint16_t program_worth = 0;
  uint64_t random_seed = 0x0123456789ABCDEF;
  uint16_t tablet_indexFinger = 0;
  // uint8_t champion = 0;
  // uint16_t champion_worth = 0;
  // v16us program_;
  // v16us population[4];
  memset(quiz_independentClause_list, 0,
         (size_t)(quiz_independentClause_list_size * TABLET_LONG * WORD_THICK));
  text_code(activity_atom_text_size, activity_atom_text, &activity_atom_size,
            activity_atom, &text_remainder);
  assert(text_remainder == 0);
  text_code(quiz_independentClause_list_text_size,
            quiz_independentClause_list_text, &quiz_independentClause_list_size,
            quiz_independentClause_list, &text_remainder);
  /* [Initialize the input data] */

  /* [Un-map the buffers] */
  /*
   * Unmap the memory objects as we have finished using them from the CPU side.
   * We unmap the memory because otherwise:
   * - reads and writes to that memory from inside a kernel on the OpenCL side
   * are undefined.
   * - the OpenCL implementation cannot free the memory when it is finished.
   */
  if (!success_verification(
          clEnqueueUnmapMemObject(command_waiting_line, memoryObjects[0],
                                  activity_atom, 0, NULL, NULL))) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  // if (!success_verification(clEnqueueUnmapMemObject(command_waiting_line,
  // memoryObjects[1],
  //                                          inputB, 0, NULL, NULL))) {
  //  cleanUpOpenCL(context, command_waiting_line, program, kernel,
  //  memoryObjects,
  //                numberOfMemoryObjects);
  //  cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__
  //       << endl;
  //  return 1;
  //}
  /* [Un-map the buffers] */

  /* [Set the kernel arguments] */
  int setKernelArgumentsSuccess = TRUE;
  printf("arg0\n");
  setKernelArgumentsSuccess &= success_verification(clSetKernelArg(
      kernel, 0, sizeof(uint8_t), (uint8_t *)&activity_atom_size));
  printf("arg1\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[0]));
  printf("arg2\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 2, sizeof(uint16_t), (uint16_t *)&program_size));
  printf("arg3\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 3, sizeof(uint8_t), (uint8_t *)&population_size));
  printf("arg4\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 4, sizeof(uint64_t), (uint64_t *)&random_seed));
  printf("arg5\n");
  setKernelArgumentsSuccess &=
      success_verification(clSetKernelArg(kernel, 5, sizeof(uint64_t *), NULL));
  printf("arg6\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 6, sizeof(cl_mem), &memoryObjects[1]));
  printf("arg7\n");
  setKernelArgumentsSuccess &=
      success_verification(clSetKernelArg(kernel, 7, sizeof(uint8_t *), NULL));
  printf("arg8\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 8, sizeof(cl_mem), &memoryObjects[2]));

  if (!setKernelArgumentsSuccess) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed setting OpenCL kernel arguments. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  /* [Set the kernel arguments] */

  /* An event to associate with the Kernel. Allows us to retrieve profiling
   * information later. */
  cl_event event = 0;

  /* [Global work size] */
  /*
   * Each instance of our OpenCL kernel operates on a single element of each
   * array so the number of
   * instances needed is the number of elements in the array.
   */
  size_t globalWorksize[1] = {population_size};
  size_t localWorksize[1] = {2};
  /* Enqueue the kernel */
  if (!success_verification(clEnqueueNDRangeKernel(
          command_waiting_line, kernel, 1, NULL, globalWorksize, localWorksize,
          0, NULL, &event))) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed enqueuing the kernel. %s:%d\n", __FILE__, __LINE__);
    return 1;
  }
  /* [Global work size] */

  /* Wait for kernel execution completion. */
  if (!success_verification(clFinish(command_waiting_line))) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed waiting for kernel execution to finish. %s:%d\n",
            __FILE__, __LINE__);
    return 1;
  }

  /* Print the profiling information for the event. */
  // printProfilingInfo(event);
  /* Release the event object. */
  if (!success_verification(clReleaseEvent(event))) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed releasing the event object. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  /* Get a pointer to the output data. */
  printf("clOut\n");
  v16us *output = (v16us *)clEnqueueMapBuffer(
      command_waiting_line, memoryObjects[1], CL_TRUE, CL_MAP_READ, 0,
      population_byte_size, 0, NULL, NULL, &errorNumber);
  v16us *newspaper = (v16us *)clEnqueueMapBuffer(
      command_waiting_line, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0,
      newspaper_byte_size, 0, NULL, NULL, &errorNumber);
  if (!success_verification(errorNumber)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Failed to map buffer. %s:%d\n", __FILE__, __LINE__);
    return 1;
  }

  /* [Output the results] */
  /* Uncomment the following block to print results. */
  for (tablet_indexFinger = 0;
       tablet_indexFinger < (population_size * TABLET_LONG);
       ++tablet_indexFinger) {
    if (tablet_indexFinger % 0x10 == 0)
      printf("\n");
    printf("%04X ", (uint)((uint16_t *)output)[tablet_indexFinger]);
  }
  printf("\n");
  // printf("program %04X \n", (uint)*((uint16_t *)&(output[1])));

  printf("newspaper \n");
  for (tablet_indexFinger = 0;
       tablet_indexFinger < (NEWSPAPER_LONG * TABLET_LONG);
       ++tablet_indexFinger) {
    if (tablet_indexFinger % 0x10 == 0)
      printf("\n");
    printf("%04X ", (uint)((uint16_t *)newspaper)[tablet_indexFinger]);
  }
  printf("\n");
  /* [Output the results] */

  /* Unmap the memory object as we are finished using them from the CPU side. */
  if (!success_verification(clEnqueueUnmapMemObject(
          command_waiting_line, memoryObjects[1], output, 0, NULL, NULL))) {
    printf("unmapping\n");
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  if (!success_verification(clEnqueueUnmapMemObject(
          command_waiting_line, memoryObjects[2], newspaper, 0, NULL, NULL))) {
    printf("unmapping\n");
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    // memoryObjects,
    //              numberOfMemoryObjects);
    fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  printf("releasing\n");
  /* Release OpenCL objects. */
  // cleanUpOpenCL(context, command_waiting_line, program, kernel,
  // memoryObjects,
  //              numberOfMemoryObjects);
}