void AdvancedMaxPoolingLayer::BackPropagate() {
#ifdef BUILD_OPENCL_MAX
  input_->delta.MoveToGPU(true);
  output_->delta.MoveToGPU();
  maximum_mask_.MoveToGPU();
  
  cl_uint error = 0;
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 0, sizeof (cl_mem), &input_->delta.cl_data_ptr_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 1, sizeof (cl_mem), &maximum_mask_.cl_data_ptr_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 2, sizeof (cl_mem), &output_->delta.cl_data_ptr_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 3, sizeof (unsigned int), &input_width_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 4, sizeof (unsigned int), &input_height_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 5, sizeof (unsigned int), &maps_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 6, sizeof (unsigned int), &output_width_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 7, sizeof (unsigned int), &output_height_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 8, sizeof (unsigned int), &region_width_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 9, sizeof (unsigned int), &region_height_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 10, sizeof (unsigned int), &stride_width_);
  error |= clSetKernelArg (CLHelper::k_amaximumBackward, 11, sizeof (unsigned int), &stride_height_);
  if (error != CL_SUCCESS) {
    FATAL ("Error setting kernel args: " << (signed int) error);
  }

  size_t global_work_size[] = { input_width_, input_height_, maps_* input_->data.samples() };

  error = clEnqueueNDRangeKernel (CLHelper::queue, CLHelper::k_amaximumBackward, 3, NULL,
                                  global_work_size, NULL, 0, NULL, NULL);
  if (error != CL_SUCCESS) {
    FATAL ("Error enqueueing kernel: " << (signed int) error);
  }

#ifdef BRUTAL_FINISH
  error = clFinish (CLHelper::queue);
  if (error != CL_SUCCESS) {
    FATAL ("Error finishing command queue: " << (signed int) error);
  }
#endif

#else
  
#define MP_HELPER_MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
  
#pragma omp parallel for default(shared)
  for(std::size_t sample = 0; sample < input_->data.samples(); sample++) {
    for (unsigned int map = 0; map < maps_; map++) {
      for (unsigned int ix = 0; ix < input_width_; ix++) {
        for(unsigned int iy = 0; iy < input_width_; iy++) {
          const unsigned int mask_index = ix + input_width_ * iy;
          const unsigned int oxstart = (ix < region_width_) ? 
            0 : (ix - region_width_) / stride_width_+ 1;
          const unsigned int oxend = MP_HELPER_MIN(ix / stride_width_ + 1, output_width_);
          
          const unsigned int oystart = (iy < region_height_) ? 
            0 : (iy - region_height_) / stride_height_ + 1;
          const unsigned int oyend = MP_HELPER_MIN(iy / stride_height_ + 1, output_height_);
          
          datum sum = 0.0;
          for (unsigned int oy = oystart; oy < oyend; oy++) {
            for (unsigned int ox = oxstart; ox < oxend; ox++) {
              if(*maximum_mask_.data_ptr_const(ox, oy, map, sample) == mask_index)
                sum += *output_->delta.data_ptr_const(ox, oy, map, sample);
            }
          }
          *(input_->delta.data_ptr(ix, iy, map, sample)) = sum;
        }
      }
    }
  }
#endif
}
Esempio n. 2
0
int task(cl_context context, cl_device_id device, cl_command_queue queue, void* data_)
{
  const TaskData* data = (const TaskData*) data_;
  cl_int err;

  if (data->points % data->points_per_work_item)
    check_error(CLQMC_INVALID_VALUE, "points must be a multiple of points_per_work_item");

  if (data->replications % data->replications_per_work_item)
    check_error(CLQMC_INVALID_VALUE, "replications must be a multiple of replications_per_work_item");


  // Lattice buffer

  size_t pointset_size;
  // gen_vec is given in common.c
  clqmcLatticeRule* pointset = clqmcLatticeRuleCreate(data->points, DIMENSION, gen_vec, &pointset_size, &err);
  check_error(err, NULL);

  cl_mem pointset_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
      pointset_size, pointset, &err);
  check_error(err, "cannot create point set buffer");


  // Shifts buffer
  
  clqmc_fptype* shifts = (clqmc_fptype*) malloc(data->replications * DIMENSION * sizeof(clqmc_fptype));

  // populate random shifts using a random stream
  clrngMrg31k3pStream* stream = clrngMrg31k3pCreateStreams(NULL, 1, NULL, &err);
  check_error(err, NULL);
  for (cl_uint i = 0; i < data->replications; i++)
      for (cl_uint j = 0; j < DIMENSION; j++)
          shifts[i * DIMENSION + j] = clrngMrg31k3pRandomU01(stream);
  err = clrngMrg31k3pDestroyStreams(stream);
  check_error(err, NULL);

  cl_mem shifts_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
      data->replications * DIMENSION * sizeof(clqmc_fptype), shifts, &err);
  check_error(err, "cannot create shifts buffer");


  // Output buffer

  size_t points_block_count = data->points / data->points_per_work_item;
  cl_mem output_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, 
      data->replications * points_block_count * sizeof(clqmc_fptype), NULL, &err);
  check_error(err, "cannot create output buffer");


  // OpenCL kernel

  cl_program program = build_program_from_file(context, device,
      "client/DocsTutorial/example4_kernel.cl",
      NULL);
  check_error(err, NULL);
  cl_kernel kernel = clCreateKernel(program, "simulateWithRQMC", &err);
  check_error(err, "cannot create kernel");

  int iarg = 0;
  err  = clSetKernelArg(kernel, iarg++, sizeof(pointset_buf), &pointset_buf);
  err |= clSetKernelArg(kernel, iarg++, sizeof(shifts_buf), &shifts_buf);
  err |= clSetKernelArg(kernel, iarg++, sizeof(data->points_per_work_item), &data->points_per_work_item);
  err |= clSetKernelArg(kernel, iarg++, sizeof(data->replications), &data->replications);
  err |= clSetKernelArg(kernel, iarg++, sizeof(output_buf), &output_buf);
  check_error(err, "cannot set kernel arguments");


  // Execution

  cl_event ev;
  size_t global_size = (data->replications / data->replications_per_work_item) * points_block_count;
  err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, &ev);
  check_error(err, "cannot enqueue kernel");

  err = clWaitForEvents(1, &ev);
  check_error(err, "error waiting for events");

  clqmc_fptype* output = (clqmc_fptype*) malloc(data->replications * points_block_count * sizeof(clqmc_fptype));
  err = clEnqueueReadBuffer(queue, output_buf, CL_TRUE, 0,
      data->replications * points_block_count * sizeof(clqmc_fptype), output, 0, NULL, NULL);
  check_error(err, "cannot read output buffer");

  printf("\nAdvanced randomized quasi-Monte Carlo integration:\n\n");

  err = clqmcLatticeRuleWriteInfo(pointset, stdout);
  check_error(err, NULL);
  printf("\n");

  rqmcReport(data->replications, data->points, points_block_count, output);


  // Clean up

  clReleaseEvent(ev);
  clReleaseMemObject(output_buf);
  clReleaseMemObject(pointset_buf);
  clReleaseKernel(kernel);
  clReleaseProgram(program);

  free(output);
  err = clqmcLatticeRuleDestroy(pointset);
  check_error(err, NULL);

  return EXIT_SUCCESS;
}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program source */
        unsigned char *source_code;
        size_t source_length;

        /* Read program from 'relational_greater_than_or_equal_to_ulong16ulong16.cl' */
        source_code = read_buffer("relational_greater_than_or_equal_to_ulong16ulong16.cl", &source_length);

        /* Create a program */
        cl_program program;
        program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret);

        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithSource' failed\n");
                exit(1);
        }
        printf("program=%p\n", program);

        /* Build program */
        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object */
        cl_kernel kernel;
        kernel = clCreateKernel(program, "relational_greater_than_or_equal_to_ulong16ulong16", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_ulong16 *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_ulong16));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_ulong16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong16), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create and init host side src buffer 1 */
        cl_ulong16 *src_1_host_buffer;
        src_1_host_buffer = malloc(num_elem * sizeof(cl_ulong16));
        for (int i = 0; i < num_elem; i++)
                src_1_host_buffer[i] = (cl_ulong16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 1 */
        cl_mem src_1_device_buffer;
        src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong16), src_1_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_int16 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_int16));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_int16));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_int16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer);
        ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_int16), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_int16));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Free host side src buffer 1 */
        free(src_1_host_buffer);

        /* Free device side src buffer 1 */
        ret = clReleaseMemObject(src_1_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
Esempio n. 4
0
static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
				int64_t __maybe_unused max_nonce)
{
	const int thr_id = thr->id;
	struct opencl_thread_data *thrdata = thr->cgpu_data;
	struct cgpu_info *gpu = thr->cgpu;
	_clState *clState = clStates[thr_id];
	const cl_kernel *kernel = &clState->kernel;
	const int dynamic_us = opt_dynamic_interval * 1000;

	cl_int status;
	size_t globalThreads[1];
	size_t localThreads[1] = { clState->wsize };
	int64_t hashes;

	/* Windows' timer resolution is only 15ms so oversample 5x */
	if (gpu->dynamic && (++gpu->intervals * dynamic_us) > 70000) {
		struct timeval tv_gpuend;
		double gpu_us;

		gettimeofday(&tv_gpuend, NULL);
		gpu_us = us_tdiff(&tv_gpuend, &gpu->tv_gpustart) / gpu->intervals;
		if (gpu_us > dynamic_us) {
			if (gpu->intensity > MIN_INTENSITY)
				--gpu->intensity;
		} else if (gpu_us < dynamic_us / 2) {
			if (gpu->intensity < MAX_INTENSITY)
				++gpu->intensity;
		}
		memcpy(&(gpu->tv_gpustart), &tv_gpuend, sizeof(struct timeval));
		gpu->intervals = 0;
	}

	set_threads_hashes(clState->vwidth, &hashes, globalThreads, localThreads[0], &gpu->intensity);
	if (hashes > gpu->max_hashes)
		gpu->max_hashes = hashes;

	status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: clSetKernelArg of all params failed.");
		return -1;
	}

	if (clState->goffset) {
		size_t global_work_offset[1];

		global_work_offset[0] = work->blk.nonce;
		status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, global_work_offset,
						globalThreads, localThreads, 0,  NULL, NULL);
	} else
		status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL,
						globalThreads, localThreads, 0,  NULL, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
		return -1;
	}

	status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
			BUFFERSIZE, thrdata->res, 0, NULL, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: clEnqueueReadBuffer failed error %d. (clEnqueueReadBuffer)", status);
		return -1;
	}

	/* The amount of work scanned can fluctuate when intensity changes
	 * and since we do this one cycle behind, we increment the work more
	 * than enough to prevent repeating work */
	work->blk.nonce += gpu->max_hashes;

	/* This finish flushes the readbuffer set with CL_FALSE in clEnqueueReadBuffer */
	clFinish(clState->commandQueue);

	/* FOUND entry is used as a counter to say how many nonces exist */
	if (thrdata->res[FOUND]) {
		/* Clear the buffer again */
		status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
				BUFFERSIZE, blank_res, 0, NULL, NULL);
		if (unlikely(status != CL_SUCCESS)) {
			applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
			return -1;
		}
		applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id);
		postcalc_hash_async(thr, work, thrdata->res);
		memset(thrdata->res, 0, BUFFERSIZE);
		/* This finish flushes the writebuffer set with CL_FALSE in clEnqueueWriteBuffer */
		clFinish(clState->commandQueue);
	}

	return hashes;
}
void 
kernel_gpu_opencl_wrapper_2(knode *knodes,
							long knodes_elem,
							long knodes_mem,

							int order,
							long maxheight,
							int count,

							long *currKnode,
							long *offset,
							long *lastKnode,
							long *offset_2,
							int *start,
							int *end,
							int *recstart,
							int *reclength)
{

	//======================================================================================================================================================150
	//	CPU VARIABLES
	//======================================================================================================================================================150

	// timer
	long long time0;
	long long time1;
	long long time2;
	long long time3;
	long long time4;
	long long time5;
	long long time6;

	time0 = get_time();

	//======================================================================================================================================================150
	//	GPU SETUP
	//======================================================================================================================================================150

	//====================================================================================================100
	//	INITIAL DRIVER OVERHEAD
	//====================================================================================================100

	// cudaThreadSynchronize();

	//====================================================================================================100
	//	COMMON VARIABLES
	//====================================================================================================100

	// common variables
	cl_int error;

	//====================================================================================================100
	//	GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
	//====================================================================================================100

	// Get the number of available platforms
	cl_uint num_platforms;
	error = clGetPlatformIDs(	0, 
								NULL, 
								&num_platforms);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of available platforms
	cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
	error = clGetPlatformIDs(	num_platforms, 
								platforms, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the 1st platform
	cl_platform_id platform = platforms[0];

	// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
	char pbuf[100];
	error = clGetPlatformInfo(	platform, 
								CL_PLATFORM_VENDOR, 
								sizeof(pbuf), 
								pbuf, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Platform: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE CONTEXT FOR THE PLATFORM
	//====================================================================================================100

	// Create context properties for selected platform
	cl_context_properties context_properties[3] = {	CL_CONTEXT_PLATFORM, 
													(cl_context_properties) platform, 
													0};

	// Create context for selected platform being GPU
	cl_context context;
	context = clCreateContextFromType(	context_properties, 
										CL_DEVICE_TYPE_GPU, 
										NULL, 
										NULL, 
										&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
	//====================================================================================================100

	// Get the number of devices (previousely selected for the context)
	size_t devices_size;
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								0, 
								NULL, 
								&devices_size);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of devices (previousely selected for the context)
	cl_device_id *devices = (cl_device_id *) malloc(devices_size);
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								devices_size, 
								devices, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one)
	cl_device_id device;
	device = devices[0];

	// Get the name of the selected device (previousely selected for the context) and print it
	error = clGetDeviceInfo(device, 
							CL_DEVICE_NAME, 
							sizeof(pbuf), 
							pbuf, 
							NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Device: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE COMMAND QUEUE FOR THE DEVICE
	//====================================================================================================100

	// Create a command queue
	cl_command_queue command_queue;
	command_queue = clCreateCommandQueue(	context, 
											device, 
											0, 
											&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	CREATE PROGRAM, COMPILE IT
	//====================================================================================================100

	// Load kernel source code from file
	const char *source = load_kernel_source("./kernel/kernel_gpu_opencl_2.cl");
	size_t sourceSize = strlen(source);

	// Create the program
	cl_program program = clCreateProgramWithSource(	context, 
													1, 
													&source, 
													&sourceSize, 
													&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	char clOptions[110];
	//  sprintf(clOptions,"-I../../src");                                                                                 
	sprintf(clOptions,"-I./../");

#ifdef DEFAULT_ORDER_2
	sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER_2=%d", DEFAULT_ORDER_2);
#endif

	// Compile the program
	error = clBuildProgram(	program, 
							1, 
							&device, 
							clOptions, 
							NULL, 
							NULL);
	// Print warnings and errors from compilation
	static char log[65536]; 
	memset(log, 0, sizeof(log));
	clGetProgramBuildInfo(	program, 
							device, 
							CL_PROGRAM_BUILD_LOG, 
							sizeof(log)-1, 
							log, 
							NULL);
	printf("-----OpenCL Compiler Output-----\n");
	if (strstr(log,"warning:") || strstr(log, "error:")) 
		printf("<<<<\n%s\n>>>>\n", log);
	printf("--------------------------------\n");
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Create kernel
	cl_kernel kernel;
	kernel = clCreateKernel(program, 
							"findRangeK", 
							&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time1 = get_time();

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY				MALLOC
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN
	//====================================================================================================100

	//==================================================50
	//	knodesD
	//==================================================50

	cl_mem knodesD;
	knodesD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								knodes_mem, 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	currKnodeD
	//==================================================50

	cl_mem currKnodeD;
	currKnodeD = clCreateBuffer(context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	offsetD
	//==================================================50

	cl_mem offsetD;
	offsetD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	lastKnodeD
	//==================================================50

	cl_mem lastKnodeD;
	lastKnodeD = clCreateBuffer(context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	offset_2D
	//==================================================50

	cl_mem offset_2D;
	offset_2D = clCreateBuffer(context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	startD
	//==================================================50

	cl_mem startD;
	startD = clCreateBuffer(context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(int), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	endD
	//==================================================50

	cl_mem endD;
	endD = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							count*sizeof(int), 
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	//==================================================50
	//	ansDStart
	//==================================================50

	cl_mem ansDStart;
	ansDStart = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							count*sizeof(int), 
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	ansDLength
	//==================================================50

	cl_mem ansDLength;
	ansDLength = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							count*sizeof(int), 
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time2 = get_time();

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY			COPY
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN
	//====================================================================================================100

	//==================================================50
	//	knodesD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									knodesD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									knodes_mem,				// size to be copied
									knodes,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	currKnodeD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									currKnodeD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(long),		// size to be copied
									currKnode,				// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	offsetD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									offsetD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(long),		// size to be copied
									offset,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	lastKnodeD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									lastKnodeD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(long),		// size to be copied
									lastKnode,				// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	offset_2D
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									offset_2D,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(long),		// size to be copied
									offset_2,				// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	startD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									startD,					// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(int),		// size to be copied
									start,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	endD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									endD,					// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(int),		// size to be copied
									end,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	//==================================================50
	//	ansDStart
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									endD,					// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(int),		// size to be copied
									end,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	ansDLength
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									ansDLength,					// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(int),		// size to be copied
									reclength,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time3 = get_time();

	//==================================================50
	//	END
	//==================================================50

	//======================================================================================================================================================150
	//	KERNEL
	//======================================================================================================================================================150

	//====================================================================================================100
	//	Execution Parameters
	//====================================================================================================100

	size_t local_work_size[1];
	local_work_size[0] = order < 1024 ? order : 1024;
	size_t global_work_size[1];
	global_work_size[0] = count * local_work_size[0];

	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);

	//====================================================================================================100
	//	Kernel Arguments
	//====================================================================================================100

	clSetKernelArg(	kernel, 
					0, 
					sizeof(long), 
					(void *) &maxheight);
	clSetKernelArg(	kernel, 
					1, 
					sizeof(cl_mem), 
					(void *) &knodesD);
	clSetKernelArg(	kernel, 
					2, 
					sizeof(long), 
					(void *) &knodes_elem);

	clSetKernelArg(	kernel, 
					3, 
					sizeof(cl_mem), 
					(void *) &currKnodeD);
	clSetKernelArg(	kernel, 
					4, 
					sizeof(cl_mem), 
					(void *) &offsetD);
	clSetKernelArg(	kernel, 
					5, 
					sizeof(cl_mem), 
					(void *) &lastKnodeD);
	clSetKernelArg(	kernel, 
					6, 
					sizeof(cl_mem), 
					(void *) &offset_2D);
	clSetKernelArg(	kernel, 
					7, 
					sizeof(cl_mem), 
					(void *) &startD);
	clSetKernelArg(	kernel, 
					8, 
					sizeof(cl_mem), 
					(void *) &endD);
	clSetKernelArg(	kernel, 
					9, 
					sizeof(cl_mem), 
					(void *) &ansDStart);
	clSetKernelArg(	kernel, 
					10, 
					sizeof(cl_mem), 
					(void *) &ansDLength);

	//====================================================================================================100
	//	Kernel
	//====================================================================================================100

	error = clEnqueueNDRangeKernel(	command_queue, 
									kernel, 
									1, 
									NULL, 
									global_work_size, 
									local_work_size, 
									0, 
									NULL, 
									NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
	error = clFinish(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time4 = get_time();

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY			COPY (CONTD.)
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	//==================================================50
	//	ansDStart
	//==================================================50

	error = clEnqueueReadBuffer(command_queue,				// The command queue.
								ansDStart,					// The image on the device.
								CL_TRUE,					// Blocking? (ie. Wait at this line until read has finished?)
								0,							// Offset. None in this case.
								count*sizeof(int),			// Size to copy.
								recstart,					// The pointer to the image on the host.
								0,							// Number of events in wait list. Not used.
								NULL,						// Event wait list. Not used.
								NULL);						// Event object for determining status. Not used.
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	ansDLength
	//==================================================50

	error = clEnqueueReadBuffer(command_queue,				// The command queue.
								ansDLength,					// The image on the device.
								CL_TRUE,					// Blocking? (ie. Wait at this line until read has finished?)
								0,							// Offset. None in this case.
								count*sizeof(int),			// Size to copy.
								reclength,					// The pointer to the image on the host.
								0,							// Number of events in wait list. Not used.
								NULL,						// Event wait list. Not used.
								NULL);						// Event object for determining status. Not used.
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time5 = get_time();

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY DEALLOCATION
	//======================================================================================================================================================150

	// Release kernels...
	clReleaseKernel(kernel);

	// Now the program...
	clReleaseProgram(program);

	// Clean up the device memory...
	clReleaseMemObject(knodesD);

	clReleaseMemObject(currKnodeD);
	clReleaseMemObject(offsetD);
	clReleaseMemObject(lastKnodeD);
	clReleaseMemObject(offset_2D);
	clReleaseMemObject(startD);
	clReleaseMemObject(endD);
	clReleaseMemObject(ansDStart);
	clReleaseMemObject(ansDLength);

	// Flush the queue
	error = clFlush(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// ...and finally, the queue and context.
	clReleaseCommandQueue(command_queue);

	// ???
	clReleaseContext(context);

	time6 = get_time();

	//======================================================================================================================================================150
	//	DISPLAY TIMING
	//======================================================================================================================================================150

	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");

	printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n",					(float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);

	printf("%15.12f s, %15.12f % : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);

	printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);

	printf("Total time:\n");
	printf("%.12f s\n", 												(float) (time6-time0) / 1000000);

	//======================================================================================================================================================150
	//	END
	//======================================================================================================================================================150

}
Esempio n. 6
0
void OpenCLExecuter::ocl_filter_shared(void)
{
	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem dst_buffer;					// OpenCL device source buffer
	cl_sampler sampler;					// OpenCL sampler
	cl_kernel ckKernel;					// OpenCL kernel

	int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process

	// set Local work size dimensions
//	size_t local_threads[3] ={256,256,64};
	// set Global work size dimensions
//	size_t global_threads[3] ={roundup((int) volobj->texwidth/local_threads[0], 0)*local_threads[0], roundup((int) volobj->texheight/local_threads[1], 0)*local_threads[1], roundup((int) volobj->texdepth/local_threads[2], 0)*local_threads[2]};

	// set Global work size dimensions
	size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth};

	// allocate the source buffer memory object
	src_buffer = clCreateFromGLTexture3D (ocl_wrapper->context, CL_MEM_READ_WRITE, GL_TEXTURE_3D, 0, volobj->TEXTURE3D_RED, &err);
	printf("OPENCL: clCreateFromGLTexture3D: %s\n", ocl_wrapper->get_error(err));

	// allocate the destination buffer memory object
	dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

	// create a sampler object
	sampler = clCreateSampler(ocl_wrapper->context, CL_FALSE, CL_ADDRESS_CLAMP, CL_FILTER_NEAREST, &err);
	printf("OPENCL: clCreateSampler: %s\n", ocl_wrapper->get_error(err));
 
    // Create the kernel
	ckKernel = clCreateKernel (cpProgram, "myFunc", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(sampler), (void*)&sampler);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	size_t local;
	err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
	printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
	printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);

	// grab input data from OpenGL, compute, copy the results back to OpenGL
	// Runs asynchronous to host, up until blocking clFinish at the end

	glFinish();
	glFlush();
	
	// grab the OpenGL texture object for read/writing from OpenCL
	err = clEnqueueAcquireGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,NULL,NULL);
	printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err));

	// Execute a kernel
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	/*
	// Blocking read of results from GPU to Host
	int size = volobj->texwidth*volobj->texheight*volobj->texdepth;
	unsigned char* result = new unsigned char[size];
	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, result, 0, NULL, NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));
	for(int i=0; i<size; i++) volobj->texture3d[3*i+0] = result[i];
	delete[] result;
	*/

	// copy OpenCL buffer to OpenGl texture
	size_t corigin[3] = {0,0,0};
	size_t cdimensions[3] = {(unsigned int)volobj->texwidth, (unsigned int)volobj->texheight, (unsigned int)volobj->texdepth};
	err = clEnqueueCopyBufferToImage(ocl_wrapper->commandQue , dst_buffer, src_buffer, 0, corigin, cdimensions, 0, NULL, NULL);
	printf("OPENCL: clEnqueueCopyBufferToImage: %s\n", ocl_wrapper->get_error(err));

	//make sure we block until we are done.
	//err = clFinish(ocl_wrapper->commandQue);
	//printf("OPENCL: clFinish: %s\n", ocl_wrapper->get_error(err));
	
	//release opengl objects now
	err = clEnqueueReleaseGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,0,0);
	printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
    //need to release any other OpenCL memory objects here
    if(src_buffer)clReleaseMemObject(src_buffer);
    if(dst_buffer)clReleaseMemObject(dst_buffer);
}
Esempio n. 7
0
void OpenCLExecuter::ocl_parrallelReduction(void)
{
	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem tmp_buffer;					// OpenCL device source buffer
	cl_mem dst_buffer;					// OpenCL device source buffer
	size_t szGlobalWorkSize;			// 1D var for Total # of work items
	size_t szLocalWorkSize;				// 1D var for # of work items in the work group
	size_t numWorkGroups;
	cl_kernel ckKernel;					// OpenCL kernel

	int iNumElements = 65536; //65536 // Length of float arrays to process

	// set Local work size dimensions
	szLocalWorkSize = 512;
	// set Global work size dimensions
	szGlobalWorkSize = roundup((int) iNumElements/szLocalWorkSize, 0)*szLocalWorkSize;  
	//szGlobalWorkSize = iNumElements;
	numWorkGroups = (float)szGlobalWorkSize/(float)szLocalWorkSize;
	printf("OPENCL: number of elements: %d\n", (int)iNumElements);
	printf("OPENCL: local worksize: %d\n", (int)szLocalWorkSize);
	printf("OPENCL: global worksize: %d\n", (int)szGlobalWorkSize);
	printf("OPENCL: work groups: %d\n", (int)(numWorkGroups));
	
	//temp array
	int* data = new int[iNumElements];

	for(int i=0; i<iNumElements; i++)
		data[i] = randomFloat(1.0, (float)iNumElements);

	data[iNumElements/2] = -100.0;

	//for(int i=0; i<iNumElements; i++)
	//	printf("data: %d\n", data[i]);

	size_t global_threads[1] ={iNumElements};

	// allocate the source buffer memory object
	src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(int) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
	// allocate the temp buffer memory object
	tmp_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(int) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
	
	// allocate the destination buffer memory object
	dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY,  sizeof(int) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

    // Create the kernel
	ckKernel = clCreateKernel (cpProgram, "min_reduce", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(int)*szLocalWorkSize, NULL);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&iNumElements);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end
	
	int numb_iterations = sqrt((float)numWorkGroups);
	numb_iterations=0;
	bool cont = true;

	Timer timer;
	timer.startTimer();
	//for(int i=0; i<numb_iterations; i++)
	while(cont)
	{
		// Write data from host to GPU
		err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(int) * iNumElements, data, 0, NULL, NULL);
		printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
		// Launch kernel 
		err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
		printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

		// Blocking read of results from GPU to Host
		err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(int) * iNumElements, data, 0, NULL,  NULL);
		printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));
		
		numb_iterations++;
		if(data[1]==0) cont = false;

		//printf("min: %d\n", data[0]);
		for(int i=0; i<numWorkGroups; i++)
			printf("min: %d\n", data[i]);
	}
	timer.endTimer("GPU find min");

	timer.startTimer();
	int min=iNumElements;
	for(int i=0; i<iNumElements; i++)
		if(data[i]<min) min = data[i];
	timer.endTimer("CPU find min");

	printf("iters: %d\n", numb_iterations);
	printf("gpu-min: %d\n", data[0]);
	printf("cpu-min: %d\n", min);
	
	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 


    //need to release any other OpenCL memory objects here
    if(dst_buffer)clReleaseMemObject(dst_buffer);
    if(src_buffer)clReleaseMemObject(src_buffer);


//	printf("min: %d\n", data[0]);

	delete[] data;
}
Esempio n. 8
0
int main()
{
	// Initiating opencl
	cl_device_id device_id;
	cl_int err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        std::cout<<"Error in device."<<std::endl;
        return EXIT_FAILURE;
    }
    cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  	if (!context)
    {
        std::cout<<"Error in context."<<std::endl;
        return EXIT_FAILURE;
    }
    cl_command_queue commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        std::cout<<"Error in command queue."<<std::endl;
        return EXIT_FAILURE;
    }
	std::ifstream in("transpMatrix.cl");
	std::string contents((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
    const char* kernelSource = contents.c_str();
    cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
    if (!program)
    {
        std::cout<<"Error in program."<<std::endl;
        return EXIT_FAILURE;
    }
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
        std::cout<<"Error in compiling the opencl program."<<std::endl;
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        std::cout<<buffer<<std::endl;
        return EXIT_FAILURE;
    }
    cl_kernel kernel = clCreateKernel(program, "simplecl", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        std::cout<<"Error in kernel "<<err<<std::endl;
        return EXIT_FAILURE;
    }

    // Data to compute
    float* data =  new float[count*count];

    for(int i = 0; i < count; ++i)
    {
        for(int j = 0; j < count; ++j)
        {
            data[i*count+j] = rand()%10;
            std::cout<<data[i*count+j]<<" ";
        }
        std::cout<<std::endl;
    }
    std::cout<<std::endl;
    // Creating communication buffers
    cl_mem input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count*count, NULL, NULL);
    cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count*count, NULL, NULL);

	if (!input || !output)
    {
        std::cout<<"Error in allocation."<<std::endl;
        return EXIT_FAILURE;
    }  

    // Copy data to input buffer
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count*count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        std::cout<<"Error in copy."<<std::endl;
        return EXIT_FAILURE;
    }

 	err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(int), &count);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
    if (err != CL_SUCCESS)
    {
        std::cout<<"Error in argument."<<std::endl;
        return EXIT_FAILURE;
    }
    size_t local[] = {1,1};
    size_t global[] = {10,10};
    // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    // if (err != CL_SUCCESS)
    // {
    //     std::cout<<"Error in getting loal."<<std::endl;
    //     return EXIT_FAILURE;
    // }
    err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, global, local, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        std::cout<<"Error in pushing to queue "<<err<<std::endl;
        return EXIT_FAILURE;
    }
    clFinish(commands);
    // Is done now

    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count*count, data, 0, NULL, NULL );  
    if (err != CL_SUCCESS)
    {
       std::cout<<"Error in reading back."<<std::endl;
       return EXIT_FAILURE;
    }

    for(int i = 0; i < count; ++i)
    {
        for(int j = 0; j < count; ++j)
        {
            std::cout<<data[i*count+j]<<" ";
        }
        std::cout<<std::endl;
    }
    std::cout<<std::endl;
	return 0;
}
Esempio n. 9
0
int main(int argc, char **argv)
{
	cl_platform_id platforms[100];
	cl_uint platforms_n = 0;
	CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n));

	printf("=== %d OpenCL platform(s) found: ===\n", platforms_n);
	for (int i=0; i<platforms_n; i++)
	{
		char buffer[10240];
		printf("  -- %d --\n", i);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL));
		printf("  PROFILE = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL));
		printf("  VERSION = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL));
		printf("  NAME = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL));
		printf("  VENDOR = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL));
		printf("  EXTENSIONS = %s\n", buffer);
	}

	cl_device_id devices[100];
	cl_uint devices_n = 0;
	// CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n));
	CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 100, devices, &devices_n));

	printf("=== %d OpenCL device(s) found on platform:\n", platforms_n);
	for (int i=0; i<devices_n; i++)
	{
		char buffer[10240];
		cl_uint buf_uint;
		cl_ulong buf_ulong;
		printf("  -- %d --\n", i);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_NAME = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VENDOR = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DRIVER_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL));
		printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);
	}

	if (devices_n == 0)
		return 1;

	cl_context context;
	context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices, &pfn_notify, NULL, &_err));

	const char *program_source[] = {
		"__kernel void simple_demo(__global int *src, __global int *dst, int factor)\n",
		"{\n",
		"	int i = get_global_id(0);\n",
		"	dst[i] = src[i] * factor;\n",
		"}\n"
	};

	cl_program program;
	program = CL_CHECK_ERR(clCreateProgramWithSource(context, sizeof(program_source)/sizeof(*program_source), program_source, NULL, &_err));
	if (clBuildProgram(program, 1, devices, "", NULL, NULL) != CL_SUCCESS) {
		char buffer[10240];
		clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
		fprintf(stderr, "CL Compilation failed:\n%s", buffer);
		abort();
	}
	CL_CHECK(clUnloadCompiler());

	cl_mem input_buffer;
	input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*NUM_DATA, NULL, &_err));

	cl_mem output_buffer;
	output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int)*NUM_DATA, NULL, &_err));

	int factor = 2;

	cl_kernel kernel;
	kernel = CL_CHECK_ERR(clCreateKernel(program, "simple_demo", &_err));
	CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer));
	CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
	CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor));

	cl_command_queue queue;
	queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[0], 0, &_err));

	for (int i=0; i<NUM_DATA; i++) {
		CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &i, 0, NULL, NULL));
	}

	cl_event kernel_completion;
	size_t global_work_size[1] = { NUM_DATA };
	CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion));
	CL_CHECK(clWaitForEvents(1, &kernel_completion));
	CL_CHECK(clReleaseEvent(kernel_completion));

	printf("Result:");
	for (int i=0; i<NUM_DATA; i++) {
		int data;
		CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &data, 0, NULL, NULL));
		printf(" %d", data);
	}
	printf("\n");

	CL_CHECK(clReleaseMemObject(input_buffer));
	CL_CHECK(clReleaseMemObject(output_buffer));

	CL_CHECK(clReleaseKernel(kernel));
	CL_CHECK(clReleaseProgram(program));
	CL_CHECK(clReleaseContext(context));

	return 0;
}
Esempio n. 10
0
int main(void) {
    // se crea los 2 vectores de entrada
    int i;
    const int LIST_SIZE = 1024;
    int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
    int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
    for(i = 0; i < LIST_SIZE; i++) {
        A[i] = i;
        B[i] = LIST_SIZE - i;
    }
 
    // cargamos el kernel en source_str
    FILE *fp;
    char *source_str;
    size_t source_size;
 
    fp = fopen("vector_add_kernel.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );
 
    // obtenemos las plataformas y informacion de los devices
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;   
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, 
            &device_id, &ret_num_devices);
 
    // creamos un contexto OpenCL
    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
 
    // creamos la cola de comandos
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
 
    // creamos el buffer de memoria en el device para cada vector
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, 
            LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
            LIST_SIZE * sizeof(int), NULL, &ret);
 
    // copiamos los vectores A y B a sus respectivas memorias buffer
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
 
    // creamos un programa para el kernel 
    cl_program program = clCreateProgramWithSource(context, 1, 
            (const char **)&source_str, (const size_t *)&source_size, &ret);
 
    // generamos el programa
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
 
    // creamos el kernel 
    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
 
    // establecemos los argumentos del kernel 
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
 
    // ejecutamos el kernel de la lista
    size_t global_item_size = LIST_SIZE; 
    size_t local_item_size = 64; // dividimos los work items en grupos de 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
            &global_item_size, &local_item_size, 0, NULL, NULL);
 
    // copiamos la memoria buffer C del device hacia la variable local C
    int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
 
    // muestra el resultado
    for(i = 0; i < LIST_SIZE; i++)
        printf("%d + %d = %d\n", A[i], B[i], C[i]);
 
    
    free(A);
    free(B);
    free(C);
    return 0;
}
int main(int argc, char **argv)
{  
   printf("start \n");
   int x, y, nsteps, i, j;
   float *u_h;
   double *f_h;  //pointers to host memory	
   int ArraySizeX = 5122;
   int ArraySizeY = 5122;
   double n, ux, uy, uxx, uxy, uyy, usq;
   FILE *fp;	
   size_t size = ArraySizeX*ArraySizeY*sizeof(float);
   size_t size1 = ArraySizeX*ArraySizeY*9*sizeof(double);
   u_h = (float *)calloc(ArraySizeX*ArraySizeY,sizeof(float));
   f_h = (double *)calloc(ArraySizeX*ArraySizeY*9,sizeof(double));
   printf("initialization \n");
    // initialization 
   for( x = 0;x<ArraySizeX;x++){
     for( y =0;y<ArraySizeY;y++){
	// define the macroscopic properties of the initial condition.
     n = 1 + Amp2*exp(-(pow(x-ArraySizeX/2,2)+pow(y-ArraySizeY/2,2))/Width);
     ux = 0;
     uy = 0;		
      // intialize f to be the local equilibrium values	
     uxx = ux*ux;
     uyy = uy*uy;
     uxy = 2*ux*uy;
     usq = uxx+ uyy;
	  
     f_h[x*ArraySizeY*9+y*9] = w1*n*(1-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+1] = w2*n*(1+3*ux+4.5*uxx-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+2] = w2*n*(1-3*ux+4.5*uxx-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+3] = w2*n*(1+3*uy+4.5*uyy-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+4]= w2*n*(1-3*uy+4.5*uyy-1.5*usq); 
     f_h[x*ArraySizeY*9+y*9+5] = w3*n*(1+3*(ux+uy)+4.5*(uxx+uxy+uyy)-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+6] = w3*n*(1+3*(-ux+uy)+4.5*(uxx-uxy+uyy)-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+7] = w3*n*(1+3*(-ux-uy)+4.5*(uxx+uxy+uyy)-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+8] = w3*n*(1+3*(ux-uy)+4.5*(uxx-uxy+uyy)-1.5*usq);
	}
    }
    
     cl_event event;
     cl_ulong time_start, time_end, total_time; 
     // use this to check the output of each API call
     cl_int status;
     // retrieve the number of platforms
     cl_uint numPlatforms = 0;
     status = clGetPlatformIDs(0,NULL,&numPlatforms);
     chk(status, "clGetPlatformIDs0");

     // allocate enough space for each platform
     cl_platform_id *platforms = NULL;
     platforms = (cl_platform_id *) malloc(numPlatforms*sizeof(cl_platform_id));

     // Fill in the platforms
     status = clGetPlatformIDs(numPlatforms, platforms, NULL);    
     chk(status, "clGetPlatformIDs1");

     // Retrieve the number of devices
     cl_uint numDevices = 0;
     status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
     chk(status, "clGetDeviceIDs0");
  
     // Allocate enough space for each device
     cl_device_id *devices = NULL;
     devices = (cl_device_id *) malloc(numDevices*sizeof(cl_device_id));

     // Fill in the devices
     status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
     chk(status, "clGetDeviceIDs1");

     // Create a context and associate it with devices
     cl_context	 context;
     context = clCreateContext(NULL,numDevices, devices, NULL, NULL, &status);
     chk(status,"clCreateContext");

     // Create  a command queue and associate it with device
     cl_command_queue cmdQueue;
     cmdQueue = clCreateCommandQueue(context, devices[0],CL_QUEUE_PROFILING_ENABLE,&status);
     chk(status,"clCreateCommandQueue");
     
     // Create Buffer objects on devices
     cl_mem u_d, f_d;
     u_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
     chk(status,"clCreatebuffer");
     f_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size1, NULL, &status);
     chk(status, "clCreatebuffer");

     // perform computing on GPU
     // copy data from host to device
     status = clEnqueueWriteBuffer(cmdQueue, u_d, CL_FALSE, 0, size, u_h, 0, NULL, NULL);
     chk(status,"ClEnqueueWriteBuffer");
     status = clEnqueueWriteBuffer(cmdQueue, f_d, CL_FALSE, 0, size1, f_h, 0, NULL, NULL);
     chk(status, "clEnqueueWriteBuffer");
     
     // create program with source code
     cl_program program = clCreateProgramWithSource(context,1,(const char**)&programSource, NULL, &status);
     chk(status, "clCreateProgramWithSource");

     // Compile program for the device
     status = clBuildProgram(program, numDevices, devices, NULL, NULL,NULL);
      // chk(status, "ClBuildProgram");
      if(status != CL_SUCCESS){
      printf("clBuildProgram failed (%d) \n", status);
      size_t log_size;
      clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
      
      char *log = (char *) malloc(log_size);
      clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
      printf("%s\n", log);
      exit(-1);
     }
      printf("successfully built program \n");
      
     // Create lattice-boltzman kernel
     cl_kernel kernel, kernel1;
     kernel = clCreateKernel(program, "lbiteration", &status);
     kernel1 = clCreateKernel(program, "Denrho", &status);
     chk(status, "clCreateKernel");
      printf("successfully create kernel \n");
     
     // Associate the input and output buffers with the kernel
     status = clSetKernelArg(kernel,0, sizeof(cl_mem), &f_d);
     status |= clSetKernelArg(kernel1,0, sizeof(cl_mem), &u_d);
     status |= clSetKernelArg(kernel1,1, sizeof(cl_mem), &f_d);
     status |= clSetKernelArg(kernel, 1, sizeof(int), &ArraySizeX);
     status |= clSetKernelArg(kernel1,2, sizeof(int), &ArraySizeX);
     status |= clSetKernelArg(kernel, 2, sizeof(int), &ArraySizeY);
     status |= clSetKernelArg(kernel1,3, sizeof(int),&ArraySizeY);
     chk(status, "clSerKernelArg");
    
     // set the work dimensions
     size_t localworksize[2] = {BLOCK_SIZE_X,BLOCK_SIZE_Y};
     int nBLOCKSX = (ArraySizeX-2)/(BLOCK_SIZE_X -2);
     int nBLOCKSY = (ArraySizeY-2)/(BLOCK_SIZE_Y -2);
     size_t globalworksize[2] = {nBLOCKSX*BLOCK_SIZE_X,nBLOCKSY*BLOCK_SIZE_Y};

     // loop the kernel
     for( nsteps = 0; nsteps < 100; nsteps++){
     status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalworksize,localworksize,0,NULL,&event);
     clWaitForEvents(1 , &event);
     clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
           sizeof(time_start), &time_start, NULL);
     clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
           sizeof(time_end), &time_end, NULL);
     total_time += time_end - time_start;
     }
     printf("Good so far \n");
     status = clEnqueueNDRangeKernel(cmdQueue, kernel1, 2, NULL, globalworksize,localworksize,0,NULL,&event);
     chk(status, "clEnqueueNDR");
     clWaitForEvents(1 , &event);
     clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
           sizeof(time_start), &time_start, NULL);
     clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
           sizeof(time_end), &time_end, NULL);
     total_time += time_end - time_start;
     printf("running time is %0.3f \n",(total_time/1000000000.0));
     // retrieve data from device
     status = clEnqueueReadBuffer(cmdQueue, u_d, CL_TRUE, 0, size, u_h, 0, NULL, NULL);
     chk(status, "clEnqueueReadBuffer");

     // Output results
     fp = fopen("SolutionCL.txt", "wt");
     for(i= 0;i<ArraySizeX;i++){
       for(j=0;j<ArraySizeY;j++)
         fprintf(fp, " %f", u_h[i*ArraySizeY+j]);
        fprintf(fp, "\n");
     } 
     fclose(fp);

     //cleanup
     clReleaseKernel(kernel);
     clReleaseKernel(kernel1);
     clReleaseProgram(program);
     clReleaseCommandQueue(cmdQueue);
     clReleaseMemObject(u_d);
     clReleaseMemObject(f_d);
     clReleaseContext(context);

     free(u_h);
     free(f_h);
     free(platforms);
     free(devices);
     
     return 0;
}
Esempio n. 12
0
File: hw2.c Progetto: hemantjp/HW2
int
main(int argc, char** argv)
{


   srand(1000);
   int i;

   unsigned int size_A = WA * HA;
   unsigned int mem_size_A = sizeof(float) * size_A;
   float* h_A = (float*) malloc(mem_size_A);

   unsigned int size_B = WB * HB;
   unsigned int mem_size_B = sizeof(float) * size_B;
   float* h_B = (float*) malloc(mem_size_B);


   randomInit(h_A, size_A);
   randomInit(h_B, size_B);


   unsigned int size_C = WC * HC;
   unsigned int mem_size_C = sizeof(float) * size_C;
   float* h_C = (float*) malloc(mem_size_C);

   cl_context clGPUContext;
   cl_command_queue clCommandQue;
   cl_program clProgram;
   cl_kernel clKernel;
   cl_event mm;

   size_t dataBytes;
   size_t kernelLength;
   cl_int errcode;


   cl_mem d_A;
   cl_mem d_B;
   cl_mem d_C;


   clGPUContext = clCreateContextFromType(0,
                   CL_DEVICE_TYPE_GPU,
                   NULL, NULL, &errcode);



   errcode = clGetContextInfo(clGPUContext,
              CL_CONTEXT_DEVICES, 0, NULL,
              &dataBytes);
   cl_device_id *clDevices = (cl_device_id *)
              malloc(dataBytes);
   errcode |= clGetContextInfo(clGPUContext,
              CL_CONTEXT_DEVICES, dataBytes,
              clDevices, NULL);



   clCommandQue = clCreateCommandQueue(clGPUContext,
                  clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode);



   d_C = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE,
          mem_size_A, NULL, &errcode);
   d_A = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
          mem_size_A, h_A, &errcode);
   d_B = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
          mem_size_B, h_B, &errcode);


   FILE* fp = fopen("hw2.cl", "r");
   fseek (fp , 0 , SEEK_END);
   const size_t lSize = ftell(fp);
   rewind(fp);
   unsigned char* buffer;
   buffer = (unsigned char*) malloc (lSize);
   fread(buffer, 1, lSize, fp);
   fclose(fp);

   cl_int status;
   clProgram = clCreateProgramWithBinary(clGPUContext,
                1, (const cl_device_id *)clDevices,
                &lSize, (const unsigned char**)&buffer,
                &status, &errcode);
   errcode = clBuildProgram(clProgram, 0, NULL, NULL,
                NULL, NULL);


   errcode = clBuildProgram(clProgram, 0,
              NULL, NULL, NULL, NULL);


   clKernel = clCreateKernel(clProgram,
               "MM", &errcode);




   size_t globalWorkSize[2];

   int wA = WA;
   int wC = WC;
   errcode = clSetKernelArg(clKernel, 0,
              sizeof(cl_mem), (void *)&d_C);
   errcode |= clSetKernelArg(clKernel, 1,
              sizeof(cl_mem), (void *)&d_A);
   errcode |= clSetKernelArg(clKernel, 2,
              sizeof(cl_mem), (void *)&d_B);
   errcode |= clSetKernelArg(clKernel, 3,
              sizeof(int), (void *)&wA);
   errcode |= clSetKernelArg(clKernel, 4,
              sizeof(int), (void *)&wC);



   globalWorkSize[0] = 16;
   globalWorkSize[1] = 16;

   cl_ulong time_start, time_end, total_time = 0;

   errcode = clEnqueueNDRangeKernel(clCommandQue,
              clKernel, 2, NULL, globalWorkSize,
              NULL, 0, NULL, &mm);
   printf("Average time = %lu\n");
   clFinish(clCommandQue);

         clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_START,
              sizeof(time_start), &time_start, NULL);
        clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_END,
               sizeof(time_end), &time_end, NULL);
         total_time += time_end - time_start;


         printf("Average time = %lu\n", total_time);
   errcode = clEnqueueReadBuffer(clCommandQue,
              d_C, CL_TRUE, 0, mem_size_C,
              h_C, 0, NULL, NULL);



   free(h_A);
   free(h_B);
   free(h_C);

   clReleaseMemObject(d_A);
   clReleaseMemObject(d_C);
   clReleaseMemObject(d_B);

   free(clDevices);

   clReleaseContext(clGPUContext);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseCommandQueue(clCommandQue);

}
Esempio n. 13
0
int main(int argc, char *argv[]) 
{
  double Mops, t1, t2;
  double tsx, tsy, tm, an, tt, gc;
  double sx_verify_value, sy_verify_value, sx_err, sy_err;
  int    i, nit;
  int    k_offset, j;
  logical verified;

  char   size[16];

  FILE *fp;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  if ((fp = fopen("timer.flag", "r")) == NULL) {
    timers_enabled = false;
  } else {
    timers_enabled = true;
    fclose(fp);
  }

  //--------------------------------------------------------------------
  //  Because the size of the problem is too large to store in a 32-bit
  //  integer for some classes, we put it into a string (for printing).
  //  Have to strip off the decimal point put in there by the floating
  //  point print statement (internal file)
  //--------------------------------------------------------------------

  sprintf(size, "%15.0lf", pow(2.0, M+1));
  j = 14;
  if (size[j] == '.') j--;
  size[j+1] = '\0';
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n");
  printf("\n Number of random numbers generated: %15s\n", size);

  verified = false;

  //--------------------------------------------------------------------
  //  Compute the number of "batches" of random number pairs generated 
  //  per processor. Adjust if the number of processors does not evenly 
  //  divide the total number
  //--------------------------------------------------------------------

  np = NN; 

  setup_opencl(argc, argv);

  timer_clear(0);
  timer_start(0);

  //--------------------------------------------------------------------
  //  Compute AN = A ^ (2 * NK) (mod 2^46).
  //--------------------------------------------------------------------

  t1 = A;

  for (i = 0; i < MK + 1; i++) {
    t2 = randlc(&t1, t1);
  }

  an = t1;
  tt = S;

  //--------------------------------------------------------------------
  //  Each instance of this loop may be performed independently. We compute
  //  the k offsets separately to take into account the fact that some nodes
  //  have more numbers to generate than others
  //--------------------------------------------------------------------

  k_offset = -1;

  DTIMER_START(T_KERNEL_EMBAR);

  // Launch the kernel
  int q_size  = GROUP_SIZE * NQ * sizeof(cl_double);
  int sx_size = GROUP_SIZE * sizeof(cl_double);
  int sy_size = GROUP_SIZE * sizeof(cl_double);
  err_code  = clSetKernelArg(kernel, 0, q_size, NULL);
  err_code |= clSetKernelArg(kernel, 1, sx_size, NULL);
  err_code |= clSetKernelArg(kernel, 2, sy_size, NULL);
  err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq);
  err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx);
  err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy);
  err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset);
  err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an);
  clu_CheckError(err_code, "clSetKernelArg()");
  
  size_t localWorkSize[] = { GROUP_SIZE };
  size_t globalWorkSize[] = { np };
  err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL,
                                    globalWorkSize, 
                                    localWorkSize,
                                    0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_EMBAR);

  double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size);
  double *gsx = (double*)malloc(gsx_size);
  double *gsy = (double*)malloc(gsy_size);

  gc  = 0.0;
  tsx = 0.0;
  tsy = 0.0;

  for (i = 0; i < NQ; i++) {
    q[i] = 0.0;
  }

  // 9. Get the result
  DTIMER_START(T_BUFFER_READ);
  err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, 
                                 gq, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, 
                                 gsx, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, 
                                 gsy, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");
  DTIMER_STOP(T_BUFFER_READ);

  for (i = 0; i < np/localWorkSize[0]; i++) {
    for (j = 0; j < NQ; j++ ){
      q[j] = q[j] + gq[i][j];
    }
    tsx = tsx + gsx[i];
    tsy = tsy + gsy[i];
  }

  for (i = 0; i < NQ; i++) {
    gc = gc + q[i];
  }

  timer_stop(0);
  tm = timer_read(0);

  nit = 0;
  verified = true;
  if (M == 24) {
    sx_verify_value = -3.247834652034740e+3;
    sy_verify_value = -6.958407078382297e+3;
  } else if (M == 25) {
    sx_verify_value = -2.863319731645753e+3;
    sy_verify_value = -6.320053679109499e+3;
  } else if (M == 28) {
    sx_verify_value = -4.295875165629892e+3;
    sy_verify_value = -1.580732573678431e+4;
  } else if (M == 30) {
    sx_verify_value =  4.033815542441498e+4;
    sy_verify_value = -2.660669192809235e+4;
  } else if (M == 32) {
    sx_verify_value =  4.764367927995374e+4;
    sy_verify_value = -8.084072988043731e+4;
  } else if (M == 36) {
    sx_verify_value =  1.982481200946593e+5;
    sy_verify_value = -1.020596636361769e+5;
  } else if (M == 40) {
    sx_verify_value = -5.319717441530e+05;
    sy_verify_value = -3.688834557731e+05;
  } else {
    verified = false;
  }

  if (verified) {
    sx_err = fabs((tsx - sx_verify_value) / sx_verify_value);
    sy_err = fabs((tsy - sy_verify_value) / sy_verify_value);
    verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON));
  }

  Mops = pow(2.0, M+1) / tm / 1000000.0;

  printf("\nEP Benchmark Results:\n\n");
  printf("CPU Time =%10.4lf\n", tm);
  printf("N = 2^%5d\n", M);
  printf("No. Gaussian Pairs = %15.0lf\n", gc);
  printf("Sums = %25.15lE %25.15lE\n", tsx, tsy);
  printf("Counts: \n");
  for (i = 0; i < NQ; i++) {
    printf("%3d%15.0lf\n", i, q[i]);
  }

  c_print_results("EP", CLASS, M+1, 0, 0, nit,
      tm, Mops, 
      "Random numbers generated",
      verified, NPBVERSION, COMPILETIME, 
      CS1, CS2, CS3, CS4, CS5, CS6, CS7,
      clu_GetDeviceTypeName(device_type), device_name);

  if (timers_enabled) {
    if (tm <= 0.0) tm = 1.0;
    tt = timer_read(0);
    printf("\nTotal time:     %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
  }

  free(gq);
  free(gsx);
  free(gsy);
  release_opencl();

  fflush(stdout);

  return 0;
}
Esempio n. 14
0
void AdvancedMaxPoolingLayer::FeedForward() {
#ifdef BUILD_OPENCL_MAX
  input_->data.MoveToGPU();
  output_->data.MoveToGPU(true);
  maximum_mask_.MoveToGPU(true);
  
  cl_uint error = 0;
  error |= clSetKernelArg (CLHelper::k_amaximumForward, 0, sizeof (cl_mem), &input_->data.cl_data_ptr_);
  error |= clSetKernelArg (CLHelper::k_amaximumForward, 1, sizeof (cl_mem), &maximum_mask_.cl_data_ptr_);
  error |= clSetKernelArg (CLHelper::k_amaximumForward, 2, sizeof (cl_mem), &output_->data.cl_data_ptr_);
  error |= clSetKernelArg (CLHelper::k_amaximumForward, 3, sizeof (unsigned int), &input_width_);
  error |= clSetKernelArg (CLHelper::k_amaximumForward, 4, sizeof (unsigned int), &input_height_);
  error |= clSetKernelArg (CLHelper::k_amaximumForward, 5, sizeof (unsigned int), &maps_);
  error |= clSetKernelArg (CLHelper::k_amaximumForward, 6, sizeof (unsigned int), &output_width_);
  error |= clSetKernelArg (CLHelper::k_amaximumForward, 7, sizeof (unsigned int), &output_height_);
  error |= clSetKernelArg (CLHelper::k_amaximumForward, 8, sizeof (unsigned int), &region_width_);
  error |= clSetKernelArg (CLHelper::k_amaximumForward, 9, sizeof (unsigned int), &region_height_);
  error |= clSetKernelArg (CLHelper::k_amaximumForward, 10, sizeof (unsigned int), &stride_width_);
  error |= clSetKernelArg (CLHelper::k_amaximumForward, 11, sizeof (unsigned int), &stride_height_);
  if (error != CL_SUCCESS) {
    FATAL ("Error setting kernel args: " << (signed int) error);
  }

  size_t global_work_size[] = { output_width_, output_height_, maps_* input_->data.samples() };

  error = clEnqueueNDRangeKernel (CLHelper::queue, CLHelper::k_amaximumForward, 3, NULL,
                                  global_work_size, NULL, 0, NULL, NULL);
  if (error != CL_SUCCESS) {
    FATAL ("Error enqueueing kernel: " << (signed int) error);
  }

#ifdef BRUTAL_FINISH
  error = clFinish (CLHelper::queue);
  if (error != CL_SUCCESS) {
    FATAL ("Error finishing command queue: " << (signed int) error);
  }
#endif

#else
#pragma omp parallel for default(shared)
  for (std::size_t sample = 0; sample < input_->data.samples(); sample++) {
    for (unsigned int map = 0; map < maps_; map++) {
      for (unsigned int ox = 0; ox < output_width_; ox++) {
        for (unsigned int oy = 0; oy < output_height_; oy++) {
          // Find maximum in region
          datum maximum = std::numeric_limits<datum>::lowest();
          unsigned int mix = 0;
          unsigned int miy = 0;
          for (unsigned int iy = oy * stride_height_;
                iy < (oy * stride_height_) + region_height_; iy++) {
            for (unsigned int ix = ox * stride_width_;
                ix < (ox * stride_width_) + region_width_; ix++) {
              const datum ival =
                *input_->data.data_ptr_const (ix, iy, map, sample);
              if (ival > maximum) {
                maximum = ival;
                mix = ix;
                miy = iy;
              }
            }
          }
          
          // Found maximum, save
          *maximum_mask_.data_ptr(ox, oy, map, sample) = input_width_ * miy + mix;
          
          // Feed forward
          *output_->data.data_ptr(ox, oy, map, sample) = maximum;
        }
      }
    }
  }
#endif
}
Esempio n. 15
0
void OpenCLExecuter::ocl_filterBoundingBox(int channel, int window_size)
{
	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem bbmin_buffer;				// OpenCL device source buffer
	cl_mem bbmax_buffer;				// OpenCL device source buffer
	size_t szGlobalWorkSize;			// 1D var for Total # of work items
	size_t szLocalWorkSize;				// 1D var for # of work items in the work group
	cl_kernel ckKernel;					// OpenCL kernel

	cl_int4 minbb;
	cl_int4 maxbb;

	minbb.s[0] = minbb.s[1] = minbb.s[2] = 8192;
	maxbb.s[0] = maxbb.s[1] = maxbb.s[2] = -8192;

	int iNumElements = 3*volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process

	size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth};

	// allocate the source buffer memory object
	src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
	// allocate the destination buffer memory object
	bbmin_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(cl_int4), NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

 	bbmax_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(cl_int4), NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

	// Create the kernel
	ckKernel = clCreateKernel (cpProgram, "myFunc", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&bbmin_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&bbmax_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&volobj->texwidth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&volobj->texheight);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&volobj->texdepth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&channel);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end

	// Write data from host to GPU
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, volobj->texture3d, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, bbmin_buffer, CL_FALSE, 0, sizeof(cl_int4), (void*)&minbb, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, bbmax_buffer, CL_FALSE, 0, sizeof(cl_int4), (void*)&maxbb, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	// Launch kernel
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	// Blocking read of results from GPU to Host
	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, bbmin_buffer, CL_TRUE, 0, sizeof(cl_int4), (void*)&minbb, 0, NULL,  NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, bbmax_buffer, CL_TRUE, 0, sizeof(cl_int4), (void*)&maxbb, 0, NULL,  NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
    //need to release any other OpenCL memory objects here
    if(src_buffer)clReleaseMemObject(src_buffer);
    if(bbmin_buffer)clReleaseMemObject(bbmin_buffer);
    if(bbmax_buffer)clReleaseMemObject(bbmax_buffer);
	
	maxbb.s[0] += (float)window_size/2.0;
	maxbb.s[1] += (float)window_size/2.0;
	maxbb.s[2] += (float)window_size/2.0;

	minbb.s[0] -= (float)window_size/2.0;
	minbb.s[1] -= (float)window_size/2.0;
	minbb.s[2] -= (float)window_size/2.0;

	maxbb.s[0] += 2;
	maxbb.s[1] += 2;
	maxbb.s[2] += 2;

	minbb.s[0] -= 2;
	minbb.s[1] -= 2;
	minbb.s[2] -= 2;	
	
	if(maxbb.s[0]>volobj->texwidth-1) maxbb.s[0]  =volobj->texwidth-1;
	if(maxbb.s[1]>volobj->texheight-1) maxbb.s[1] =volobj->texheight-1;
	if(maxbb.s[2]>volobj->texdepth-1) maxbb.s[2] =volobj->texdepth-1;

	if(minbb.s[0]<0) minbb.s[0]=0;
	if(minbb.s[1]<0) minbb.s[1]=0;
	if(minbb.s[2]<0) minbb.s[2]=0;

	volobj->boundingboxSize.x = ((maxbb.s[0])-(minbb.s[0]-1));
	volobj->boundingboxSize.y = ((maxbb.s[1])-(minbb.s[1]-1));
	volobj->boundingboxSize.z = ((maxbb.s[2])-(minbb.s[2]-1));
	volobj->boundingboxCentre.x = 0.0; //-(((float)boundingboxSize.x)/2.0);
	volobj->boundingboxCentre.y = 0.0; //-(((float)boundingboxSize.y)/2.0);
	volobj->boundingboxCentre.z = 0.0; //-(((float)boundingboxSize.z)/2.0);
	volobj->boundingboxMin = Vector(minbb.s[0], minbb.s[1], minbb.s[2]);
	volobj->boundingboxMax = Vector(maxbb.s[0], maxbb.s[1], maxbb.s[2]);

	printf("min: %f, %f, %f\n", volobj->boundingboxMin.x, volobj->boundingboxMin.y, volobj->boundingboxMin.z);
	printf("max: %f, %f, %f\n", volobj->boundingboxMax.x, volobj->boundingboxMax.y, volobj->boundingboxMax.z);
}
int main(void) {
//time meassuring
  	struct timeval tvs;
  	struct timeval tve;
    	float elapsedTime;

	int	  Nx;
	int 	  Ny;
	int 	  Nz;
	int	  N;
	int 	  plotnum=0;
	int	  Tmax=0;
	int 	  plottime=0;
	int	  plotgap=0;
	float	  Lx,Ly,Lz;
	float	  dt=0.0;	
	float	  A=0.0;
	float	  B=0.0;
	float	  Du=0.0;
	float	  Dv=0.0;
	float	  a[2]={1.0,0.0};	
	float 	  b[2]={0.5,0.0};
	float*	  x,*y,*z ;
	float*	  u[2],*v[2];
//openCL variables
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem cl_u[2] = {NULL,NULL};
    cl_mem cl_v[2] = {NULL,NULL};
    cl_mem cl_uhat[2] = {NULL,NULL};
    cl_mem cl_vhat[2] = {NULL,NULL};
    cl_mem cl_x = NULL;
    cl_mem cl_y = NULL;
    cl_mem cl_z = NULL;
    cl_mem cl_kx = NULL;
    cl_mem cl_ky = NULL;
    cl_mem cl_kz = NULL;
    cl_program p_grid = NULL,p_frequencies = NULL,p_initialdata = NULL,p_linearpart=NULL,p_nonlinearpart=NULL;
    cl_kernel grid = NULL,frequencies = NULL,initialdata = NULL,linearpart=NULL,nonlinearpart=NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;
	ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    	ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices);
	context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
    	size_t source_size;
    	char *source_str;
//end opencl
	int  i,n;
int status=0;	
 	//int  start, finish, count_rate, ind, numthreads
	char	nameconfig[100]="";
//Read infutfile
	char	InputFileName[]="./INPUTFILE";
	FILE*fp;
	fp=fopen(InputFileName,"r");
   	 if(!fp) {fprintf(stderr, "Failed to load IPUTFILE.\n");exit(1);}	 
	int ierr=fscanf(fp, "%d %d %d %d %d %f %f %f %f %f %f %f %f", &Nx,&Ny,&Nz,&Tmax,&plotgap,&Lx,&Ly,&Lz,&dt,&Du,&Dv,&A,&B);
	if(ierr!=13){fprintf(stderr, "INPUTFILE corrupted.\n");exit(1);}	
	fclose(fp);
	printf("NX %d\n",Nx); 
	printf("NY %d\n",Ny); 
	printf("NZ %d\n",Nz); 
	printf("Tmax %d\n",Tmax);
	printf("plotgap %d\n",plotgap);
	printf("Lx %f\n",Lx);
	printf("Ly %f\n",Ly);
	printf("Lz %f\n",Lz);
	printf("dt %f\n",dt);		
	printf("Du %f\n",Du);
	printf("Dv %f\n",Dv);
	printf("F %f\n",A);
	printf("k %f\n",B);
	printf("Read inputfile\n");
	N=Nx*Ny*Nz;
	plottime=plotgap;
	B=A+B;
//ALLocate the memory
	u[0]=(float*) malloc(N*sizeof(float));
	v[0]=(float*) malloc(N*sizeof(float));
	x=(float*) malloc(Nx*sizeof(float));
	y=(float*) malloc(Ny*sizeof(float));
	z=(float*) malloc(Nz*sizeof(float));

//allocate gpu mem
	cl_u[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_v[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_u[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_v[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_uhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_vhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_uhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_vhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	printf("allocated space\n");

	// FFT library realted declarations. 
	clfftPlanHandle planHandle;
	clfftDim dim = CLFFT_3D;
	size_t clLengths[3] = {Nx, Ny, Nz};
	// Setup clFFT. 
	clfftSetupData fftSetup;
	ret = clfftInitSetupData(&fftSetup);
	ret = clfftSetup(&fftSetup);
	// Create a default plan for a complex FFT. 
	ret = clfftCreateDefaultPlan(&planHandle, context, dim, clLengths);
	// Set plan parameters. 
	ret = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE);
	ret = clfftSetLayout(planHandle, CLFFT_COMPLEX_PLANAR, CLFFT_COMPLEX_PLANAR);
	ret = clfftSetResultLocation(planHandle, CLFFT_OUTOFPLACE);
	// Bake the plan. 
	ret = clfftBakePlan(planHandle, 1, &command_queue, NULL, NULL);
	// Create temporary buffer. 
	cl_mem tmpBufferu = 0;
	cl_mem tmpBufferv = 0;
	// Size of temp buffer. 
	size_t tmpBufferSize = 0;
	status = clfftGetTmpBufSize(planHandle, &tmpBufferSize);
	if ((status == 0) && (tmpBufferSize > 0)) {
		tmpBufferu = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret);
		tmpBufferv = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret);
		if (ret != CL_SUCCESS)
			printf("Error with tmpBuffer clCreateBuffer\n");
	}
//kernel grid
    	fp = fopen("./grid.cl", "r");
    	if (!fp) {fprintf(stderr, "Failed to load grid.\n"); exit(1); }
    	source_str = (char *)malloc(MAX_SOURCE_SIZE);
   	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp );
    	fclose( fp );
	
	p_grid = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        ret = clBuildProgram(p_grid, 1, &device_id, NULL, NULL, NULL);
        grid = clCreateKernel(p_grid, "grid", &ret);
//first x
	cl_x = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret);
        ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_x);
	ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lx);
	ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nx);
	size_t global_work_size_x[3] = {Nx, 0, 0};
        ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
        ret = clEnqueueReadBuffer(command_queue, cl_x, CL_TRUE, 0, Nx * sizeof(float), x, 0, NULL, NULL);
	ret = clFinish(command_queue);
//then y
	cl_y = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret);	
	ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_y);
	ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Ly);
	ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Ny);
	size_t global_work_size_y[3] = {Ny, 0, 0};

	ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
        ret = clEnqueueReadBuffer(command_queue, cl_y, CL_TRUE, 0, Ny * sizeof(float), y, 0, NULL, NULL);
	ret = clFinish(command_queue);

//last z
	cl_z = clCreateBuffer(context, CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret);
	ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_z);
	ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lz);
	ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nz);
	size_t global_work_size_z[3] = {Nz, 0, 0};
	ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
	ret = clEnqueueReadBuffer(command_queue, cl_z, CL_TRUE, 0, Nz * sizeof(float), z, 0, NULL, NULL);
	ret = clFinish(command_queue);
    	ret = clReleaseKernel(grid); ret = clReleaseProgram(p_grid);

//kernel initial data
    	fp = fopen("./initialdata.cl", "r");
    	if (!fp) {fprintf(stderr, "Failed to load initialdata.\n"); exit(1); }
	free(source_str);    	
	source_str = (char *)malloc(MAX_SOURCE_SIZE);
   	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp );
    	fclose( fp );

	p_initialdata = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        ret = clBuildProgram(p_initialdata, 1, &device_id, NULL, NULL, NULL);
        initialdata = clCreateKernel(p_initialdata, "initialdata", &ret);


        ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u[0]);
	ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v[0]);
        ret = clSetKernelArg(initialdata, 2, sizeof(cl_mem),(void *)&cl_u[1]);
	ret = clSetKernelArg(initialdata, 3, sizeof(cl_mem),(void* )&cl_v[1]);
	ret = clSetKernelArg(initialdata, 4, sizeof(cl_mem),(void* )&cl_x);
	ret = clSetKernelArg(initialdata, 5, sizeof(cl_mem),(void* )&cl_y);
	ret = clSetKernelArg(initialdata, 6, sizeof(cl_mem),(void* )&cl_z);
	ret = clSetKernelArg(initialdata, 7, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(initialdata, 8, sizeof(int),(void* )&Ny);
	ret = clSetKernelArg(initialdata, 9, sizeof(int),(void* )&Nz);
	size_t global_work_size[3] = {N, 0, 0};
        ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
	ret = clReleaseKernel(initialdata); ret = clReleaseProgram(p_initialdata);
        ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL);
	ret = clFinish(command_queue);
        ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL);
	ret = clFinish(command_queue);
	ret = clReleaseMemObject(cl_x);
	ret = clReleaseMemObject(cl_y);
	ret = clReleaseMemObject(cl_z);
//write to disk
	fp=fopen("./data/xcoord.dat","w");
    	if (!fp) {fprintf(stderr, "Failed to write xcoord.dat.\n"); exit(1); }
	for(i=0;i<Nx;i++){fprintf(fp,"%f\n",x[i]);}
    	fclose( fp );
	fp=fopen("./data/ycoord.dat","w");
    	if (!fp) {fprintf(stderr, "Failed to write ycoord.dat.\n"); exit(1); }
	for(i=0;i<Ny;i++){fprintf(fp,"%f\n",y[i]);}
    	fclose( fp );
	fp=fopen("./data/zcoord.dat","w");
    	if (!fp) {fprintf(stderr, "Failed to write zcoord.dat.\n"); exit(1); }
	for(i=0;i<Nz;i++){fprintf(fp,"%f\n",z[i]);}
    	fclose( fp );
	free(x); free(y); free(z);
	n=0;
	plotnum=0;
//output of initial data U
	char tmp_str[10];
	strcpy(nameconfig,"./data/u");
	sprintf(tmp_str,"%d",10000000+plotnum);
	strcat(nameconfig,tmp_str);
	strcat(nameconfig,".datbin");
	fp=fopen(nameconfig,"wb");
    	if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); }
	for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);}
    	fclose( fp );	
//V
	strcpy(nameconfig,"./data/v");
	sprintf(tmp_str,"%d",10000000+plotnum);
	strcat(nameconfig,tmp_str);
	strcat(nameconfig,".datbin");
	fp=fopen(nameconfig,"wb");
    	if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); }
	for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);}
    	fclose( fp );


//frequencies kernel

    	fp = fopen("./frequencies.cl", "r");
    	if (!fp) {fprintf(stderr, "Failed to load frequencies.\n"); exit(1); }
	free(source_str);
    	source_str = (char *)malloc(MAX_SOURCE_SIZE);
   	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp );
    	fclose( fp );
	
	p_frequencies = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        ret = clBuildProgram(p_frequencies, 1, &device_id, NULL, NULL, NULL);
        frequencies = clCreateKernel(p_frequencies, "frequencies", &ret);
//get frequencies first x
	cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret);
        ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kx);
	ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lx);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nx);
        ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//then y
	cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret);	
	ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_ky);
	ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Ly);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Ny);
	ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//last z
	cl_kz = clCreateBuffer(context, CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret);
	ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kz);
	ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lz);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nz);
	ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

	printf("Setup grid, fourier frequencies and initialcondition\n");
//load the rest of the kernels
//linearpart kernel
    	fp = fopen("./linearpart.cl", "r");
    	if (!fp) {fprintf(stderr, "Failed to load linearpart.\n"); exit(1); }
	free(source_str);    	
	source_str = (char *)malloc(MAX_SOURCE_SIZE);
   	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp );
    	fclose( fp );

	p_linearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        ret = clBuildProgram(p_linearpart, 1, &device_id, NULL, NULL, NULL);
        linearpart = clCreateKernel(p_linearpart, "linearpart", &ret);

//kernel nonlinear
    	fp = fopen("./nonlinearpart.cl", "r");
    	if (!fp) {fprintf(stderr, "Failed to load nonlinearpart.\n"); exit(1); }
	free(source_str);    	
	source_str = (char *)malloc(MAX_SOURCE_SIZE);
   	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp );
    	fclose( fp );

	p_nonlinearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        ret = clBuildProgram(p_nonlinearpart, 1, &device_id, NULL, NULL, NULL);
        nonlinearpart = clCreateKernel(p_nonlinearpart, "nonlinearpart", &ret);

	printf("Got initial data, starting timestepping\n");
  gettimeofday(&tvs, NULL); 
	for(n=0;n<=Tmax;n++){
//linear
	ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu);
	ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv);
	ret = clFinish(command_queue);

        ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]);
        ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]);
        ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]);
        ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]);
	ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx);
	ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky);
	ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz);
	ret = clSetKernelArg(linearpart, 7, sizeof(float),(void* )&dt);
	ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du);
	ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv);
	ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A);
	ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B);
	ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]);
	ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]);
	ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny);
	ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz);
        ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

	ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu);
	ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv);
	ret = clFinish(command_queue);    
//nonlinearpart
        ret = clSetKernelArg(nonlinearpart, 0, sizeof(cl_mem),(void *)&cl_u[0]);
        ret = clSetKernelArg(nonlinearpart, 1, sizeof(cl_mem),(void *)&cl_u[1]);
	ret = clSetKernelArg(nonlinearpart, 2, sizeof(cl_mem),(void* )&cl_v[0]);
	ret = clSetKernelArg(nonlinearpart, 3, sizeof(cl_mem),(void* )&cl_v[1]);
	ret = clSetKernelArg(nonlinearpart, 4, sizeof(float),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart, 5, sizeof(float),(void* )&a[0]);
	ret = clSetKernelArg(nonlinearpart, 6, sizeof(float),(void* )&a[1]);
        ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);		
// linear part
	ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu);
	ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv);	
	ret = clFinish(command_queue);

        ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]);
        ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]);
        ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]);
        ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]);
	ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx);
	ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky);
	ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz);
	ret = clSetKernelArg(linearpart, 7, sizeof(float),(void* )&dt);
	ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du);
	ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv);
	ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A);
	ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B);
	ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]);
	ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]);
	ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny);
	ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz);
        ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

	ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu);
	ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv);
	ret = clFinish(command_queue);
// done
	if(n==plottime){
		printf("time:%f, step:%d,%d\n",n*dt,n,plotnum);
		plottime=plottime+plotgap;
		plotnum=plotnum+1;
        ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL);
        ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL);
	ret = clFinish(command_queue);
//output of data U
	char tmp_str[10];
	strcpy(nameconfig,"./data/u");
	sprintf(tmp_str,"%d",10000000+plotnum);
	strcat(nameconfig,tmp_str);
	strcat(nameconfig,".datbin");
	fp=fopen(nameconfig,"wb");
    	if (!fp) {fprintf(stderr, "Failed to write u-data.\n"); exit(1); }
	for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);}
    	fclose( fp );	
//V
	strcpy(nameconfig,"./data/v");
	sprintf(tmp_str,"%d",10000000+plotnum);
	strcat(nameconfig,tmp_str);
	strcat(nameconfig,".datbin");
	fp=fopen(nameconfig,"wb");
    	if (!fp) {fprintf(stderr, "Failed to write v-data.\n"); exit(1); }
	for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);}
    	fclose( fp );
}
	}
 	gettimeofday(&tve, NULL); 
	printf("Finished time stepping\n");
 	elapsedTime = (tve.tv_sec - tvs.tv_sec) * 1000.0;      // sec to ms
    	elapsedTime += (tve.tv_usec - tvs.tv_usec) / 1000.0;   // us to ms
   	printf("%f,",elapsedTime);



	clReleaseMemObject(cl_u[0]);
	clReleaseMemObject(cl_u[1]);
	clReleaseMemObject(cl_v[0]);
	clReleaseMemObject(cl_v[1]);
	clReleaseMemObject(cl_uhat[0]);
	clReleaseMemObject(cl_uhat[1]);
	clReleaseMemObject(cl_vhat[0]);
	clReleaseMemObject(cl_vhat[1]);
	clReleaseMemObject(cl_kx);
	clReleaseMemObject(cl_ky);
	clReleaseMemObject(cl_kz);
    	ret = clReleaseKernel(frequencies); ret = clReleaseProgram(p_frequencies);
    	ret = clReleaseKernel(linearpart); ret = clReleaseProgram(p_linearpart);
    	ret = clReleaseKernel(nonlinearpart); ret = clReleaseProgram(p_nonlinearpart);
	free(u[0]);
	free(v[0]);
	clReleaseMemObject(tmpBufferu);
	clReleaseMemObject(tmpBufferv);
	/* Release the plan. */
	ret = clfftDestroyPlan(&planHandle);
	/* Release clFFT library. */
	clfftTeardown();

	ret = clReleaseCommandQueue(command_queue);
     	ret = clReleaseContext(context);	
	printf("Program execution complete\n");

	return 0;
}
Esempio n. 17
0
void OpenCLExecuter::ocl_filterPeronaMalik(float lambda, float dT, unsigned char* src_array, unsigned char* dst_array, int w, int h, int d)
{
	float lambda2 = lambda*lambda;

	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem dst_buffer;					// OpenCL device source buffer
	size_t szGlobalWorkSize;			// 1D var for Total # of work items
	size_t szLocalWorkSize;				// 1D var for # of work items in the work group
	cl_kernel ckKernel;					// OpenCL kernel

	int iNumElements = w*h*d; // Length of float arrays to process

	size_t global_threads[3] ={w,h,d};

	// allocate the source buffer memory object
	src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
	// allocate the destination buffer memory object
	dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

    // Create the kernel
	ckKernel = clCreateKernel (cpProgram, "myFunc", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(float), (void*)&lambda2);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(float), (void*)&dT);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&volobj->texwidth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&volobj->texheight);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&volobj->texdepth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	size_t local;
	err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
	printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
	printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end

	// Write data from host to GPU
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	// Launch kernel 
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	// Blocking read of results from GPU to Host
	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL,  NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
    //need to release any other OpenCL memory objects here
    if(dst_buffer)clReleaseMemObject(dst_buffer);
    if(src_buffer)clReleaseMemObject(src_buffer);
}
Esempio n. 18
0
int main(void)
{
    float *h_psum;              // vector to hold partial sum
    int in_nsteps = INSTEPS;    // default number of steps (updated later to device preferable)
    int niters = ITERS;         // number of iterations
    int nsteps;
    float step_size;
    size_t nwork_groups;
    size_t max_size, work_group_size = 8;
    float pi_res;

    cl_mem d_partial_sums;

    char *kernelsource = getKernelSource("../pi_ocl.cl");             // Kernel source

    cl_int err;
    cl_device_id     device_id;     // compute device id 
    cl_context       context;       // compute context
    cl_command_queue commands;      // compute command queue
    cl_program       program;       // compute program
    cl_kernel        kernel_pi;     // compute kernel

    // Set up OpenCL context. queue, kernel, etc.
    cl_uint numPlatforms;
    // Find number of platforms
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to find a platform!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Get all platforms
    cl_platform_id Platform[numPlatforms];
    err = clGetPlatformIDs(numPlatforms, Platform, NULL);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to get the platform!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Secure a device
    for (int i = 0; i < numPlatforms; i++)
    {
        err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
        if (err == CL_SUCCESS)
            break;
    }
    if (device_id == NULL)
    {
        printf("Error: Failed to create a device group!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Output information
    err = output_device_info(device_id);
    // Create a compute context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    // Build the program  
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }
    // Create the compute kernel from the program 
    kernel_pi = clCreateKernel(program, "pi", &err);
    if (!kernel_pi || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    // Find kernel work-group size
    err = clGetKernelWorkGroupInfo (kernel_pi, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &work_group_size, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to get kernel work-group info\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    // Now that we know the size of the work-groups, we can set the number of
    // work-groups, the actual number of steps, and the step size
    nwork_groups = in_nsteps/(work_group_size*niters);

    if (nwork_groups < 1)
    {
        err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &nwork_groups, NULL);
        work_group_size = in_nsteps / (nwork_groups * niters);
    }

    nsteps = work_group_size * niters * nwork_groups;
    step_size = 1.0f/(float)nsteps;
    h_psum = calloc(sizeof(float), nwork_groups);
    if (!h_psum)
    {
        printf("Error: could not allocate host memory for h_psum\n");
        return EXIT_FAILURE;
    }

    printf(" %ld work-groups of size %ld. %d Integration steps\n",
            nwork_groups,
            work_group_size,
            nsteps);

    d_partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    // Set kernel arguments
    err  = clSetKernelArg(kernel_pi, 0, sizeof(int), &niters);
    err |= clSetKernelArg(kernel_pi, 1, sizeof(float), &step_size);
    err |= clSetKernelArg(kernel_pi, 2, sizeof(float) * work_group_size, NULL);
    err |= clSetKernelArg(kernel_pi, 3, sizeof(cl_mem), &d_partial_sums);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments!\n");
        return EXIT_FAILURE;
    }

    // Execute the kernel over the entire range of our 1D input data set
    // using the maximum number of work items for this device
    size_t global = nwork_groups * work_group_size;
    size_t local = work_group_size;
    double rtime = wtime();
    err = clEnqueueNDRangeKernel(
        commands,
        kernel_pi,
        1, NULL,
        &global,
        &local,
        0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to execute kernel\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }


    err = clEnqueueReadBuffer(
        commands,
        d_partial_sums,
        CL_TRUE,
        0,
        sizeof(float) * nwork_groups,
        h_psum,
        0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    // complete the sum and compute the final integral value on the host
    pi_res = 0.0f;
    for (unsigned int i = 0; i < nwork_groups; i++)
    {
        pi_res += h_psum[i];
    }
    pi_res *= step_size;

    rtime = wtime() - rtime;

    printf("\nThe calculation ran in %lf seconds\n", rtime);
    printf(" pi = %f for %d steps\n", pi_res, nsteps);

    // clean up
    clReleaseMemObject(d_partial_sums);
    clReleaseProgram(program);
    clReleaseKernel(kernel_pi);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    free(kernelsource);
    free(h_psum);
}
Esempio n. 19
0
void OpenCLExecuter::ocl_filterGaussian(unsigned char* src_array, unsigned char* dst_array, int w, int h, int d)
{
//	printf("gaussian_sum: %f\n", gaussian_sum);
	printf("gaussian_width: %d\n", filter_width);
	printf("gaussian_mask size: %d\n", filter_kernel.size());

	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem gauss_buffer;				// OpenCL device source buffer
	cl_mem dst_buffer;					// OpenCL device source buffer
	size_t szGlobalWorkSize;			// 1D var for Total # of work items
	size_t szLocalWorkSize;				// 1D var for # of work items in the work group
	cl_kernel ckKernel;					// OpenCL kernel

	int iNumElements = w*h*d; // Length of float arrays to process

	size_t global_threads[3] ={w,h,d};

	// allocate the source buffer memory object
	src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
	gauss_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(float) * filter_kernel.size(), NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

	// allocate the destination buffer memory object
	dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
 
	//==================================================
	// X axis 
	//==================================================

	// Create the kernel
	ckKernel = clCreateKernel (cpProgram, "gaussianX", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&gauss_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&filter_width);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&w);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&h);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&d);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	/*size_t local;
	err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
	printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
	printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);*/

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end

	// Write data from host to GPU
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, gauss_buffer, CL_FALSE, 0, sizeof(float) * filter_kernel.size(), &filter_kernel[0], 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	// Launch kernel 
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	// Blocking read of results from GPU to Host
	//err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL,  NULL);
	//printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
	//==================================================
	// Y axis 
	//==================================================

	// Create the kernel
	ckKernel = clCreateKernel (cpProgram, "gaussianY", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&gauss_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&filter_width);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&w);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&h);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&d);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

/*	size_t local;
	err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
	printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
	printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);
*/
	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end

	// Write data from host to GPU
	//err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL);
	//printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, gauss_buffer, CL_FALSE, 0, sizeof(float) * filter_kernel.size(), &filter_kernel[0], 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	// Launch kernel 
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	// Blocking read of results from GPU to Host
	//err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL,  NULL);
	//printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
	//==================================================
	// Z axis 
	//==================================================

	// Create the kernel
	ckKernel = clCreateKernel (cpProgram, "gaussianZ", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&gauss_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&filter_width);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&w);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&h);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&d);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	/*size_t local;
	err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
	printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
	printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);
	*/

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end

	//Prepare data to upload
	//for(int j=0; j<iNumElements; j++)
	//	data[j] = volobj->texture3d[3*j+0];

	// Write data from host to GPU
	//err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL);
	//printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, gauss_buffer, CL_FALSE, 0, sizeof(float) * filter_kernel.size(), &filter_kernel[0], 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	// Launch kernel 
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	// Blocking read of results from GPU to Host
	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL,  NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
    //need to release any other OpenCL memory objects here
    if(dst_buffer)clReleaseMemObject(dst_buffer);
    if(src_buffer)clReleaseMemObject(src_buffer);
    if(gauss_buffer)clReleaseMemObject(gauss_buffer);
}
// host stub function
void ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block,
                                          int dim, int *range, ops_arg arg0,
                                          ops_arg arg1, ops_arg arg2,
                                          ops_arg arg3, ops_arg arg4,
                                          ops_arg arg5, ops_arg arg6,
                                          ops_arg arg7) {

  // Timing
  double t1, t2, c1, c2;

  ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7};

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 8, range, 22))
    return;
#endif

  if (OPS_diags > 1) {
    ops_timing_realloc(22, "update_halo_kernel1_fr2");
    OPS_kernels[22].count++;
    ops_timers_core(&c1, &t1);
  }

  // compute locally allocated range for the sub-block
  int start[3];
  int end[3];
#ifdef OPS_MPI
  sub_block_list sb = OPS_sub_block_list[block->index];
  if (!sb->owned)
    return;
  for (int n = 0; n < 3; n++) {
    start[n] = sb->decomp_disp[n];
    end[n] = sb->decomp_disp[n] + sb->decomp_size[n];
    if (start[n] >= range[2 * n]) {
      start[n] = 0;
    } else {
      start[n] = range[2 * n] - start[n];
    }
    if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0)
      start[n] = range[2 * n];
    if (end[n] >= range[2 * n + 1]) {
      end[n] = range[2 * n + 1] - sb->decomp_disp[n];
    } else {
      end[n] = sb->decomp_size[n];
    }
    if (sb->id_p[n] == MPI_PROC_NULL &&
        (range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n]))
      end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]);
  }
#else
  for (int n = 0; n < 3; n++) {
    start[n] = range[2 * n];
    end[n] = range[2 * n + 1];
  }
#endif

  int x_size = MAX(0, end[0] - start[0]);
  int y_size = MAX(0, end[1] - start[1]);
  int z_size = MAX(0, end[2] - start[2]);

  int xdim0 = args[0].dat->size[0];
  int ydim0 = args[0].dat->size[1];
  int xdim1 = args[1].dat->size[0];
  int ydim1 = args[1].dat->size[1];
  int xdim2 = args[2].dat->size[0];
  int ydim2 = args[2].dat->size[1];
  int xdim3 = args[3].dat->size[0];
  int ydim3 = args[3].dat->size[1];
  int xdim4 = args[4].dat->size[0];
  int ydim4 = args[4].dat->size[1];
  int xdim5 = args[5].dat->size[0];
  int ydim5 = args[5].dat->size[1];
  int xdim6 = args[6].dat->size[0];
  int ydim6 = args[6].dat->size[1];

  // build opencl kernel if not already built

  buildOpenCLKernels_update_halo_kernel1_fr2(xdim0, ydim0, xdim1, ydim1, xdim2,
                                             ydim2, xdim3, ydim3, xdim4, ydim4,
                                             xdim5, ydim5, xdim6, ydim6);

  // set up OpenCL thread blocks
  size_t globalWorkSize[3] = {
      ((x_size - 1) / OPS_block_size_x + 1) * OPS_block_size_x,
      ((y_size - 1) / OPS_block_size_y + 1) * OPS_block_size_y,
      ((z_size - 1) / OPS_block_size_z + 1) * OPS_block_size_z};
  size_t localWorkSize[3] = {OPS_block_size_x, OPS_block_size_y,
                             OPS_block_size_z};

  int *arg7h = (int *)arg7.data;

  int consts_bytes = 0;

  consts_bytes += ROUND_UP(NUM_FIELDS * sizeof(int));

  reallocConstArrays(consts_bytes);

  consts_bytes = 0;
  arg7.data = OPS_consts_h + consts_bytes;
  arg7.data_d = OPS_consts_d + consts_bytes;
  for (int d = 0; d < NUM_FIELDS; d++)
    ((int *)arg7.data)[d] = arg7h[d];
  consts_bytes += ROUND_UP(NUM_FIELDS * sizeof(int));
  mvConstArraysToDevice(consts_bytes);

  // set up initial pointers
  int d_m[OPS_MAX_DIM];
#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[0].dat->d_m[d];
#endif
  int base0 = 1 * 1 * (start[0] * args[0].stencil->stride[0] -
                       args[0].dat->base[0] - d_m[0]);
  base0 = base0 +
          args[0].dat->size[0] * 1 * (start[1] * args[0].stencil->stride[1] -
                                      args[0].dat->base[1] - d_m[1]);
  base0 = base0 +
          args[0].dat->size[0] * 1 * args[0].dat->size[1] * 1 *
              (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[1].dat->d_m[d];
#endif
  int base1 = 1 * 1 * (start[0] * args[1].stencil->stride[0] -
                       args[1].dat->base[0] - d_m[0]);
  base1 = base1 +
          args[1].dat->size[0] * 1 * (start[1] * args[1].stencil->stride[1] -
                                      args[1].dat->base[1] - d_m[1]);
  base1 = base1 +
          args[1].dat->size[0] * 1 * args[1].dat->size[1] * 1 *
              (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[2].dat->d_m[d];
#endif
  int base2 = 1 * 1 * (start[0] * args[2].stencil->stride[0] -
                       args[2].dat->base[0] - d_m[0]);
  base2 = base2 +
          args[2].dat->size[0] * 1 * (start[1] * args[2].stencil->stride[1] -
                                      args[2].dat->base[1] - d_m[1]);
  base2 = base2 +
          args[2].dat->size[0] * 1 * args[2].dat->size[1] * 1 *
              (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[3].dat->d_m[d];
#endif
  int base3 = 1 * 1 * (start[0] * args[3].stencil->stride[0] -
                       args[3].dat->base[0] - d_m[0]);
  base3 = base3 +
          args[3].dat->size[0] * 1 * (start[1] * args[3].stencil->stride[1] -
                                      args[3].dat->base[1] - d_m[1]);
  base3 = base3 +
          args[3].dat->size[0] * 1 * args[3].dat->size[1] * 1 *
              (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[4].dat->d_m[d];
#endif
  int base4 = 1 * 1 * (start[0] * args[4].stencil->stride[0] -
                       args[4].dat->base[0] - d_m[0]);
  base4 = base4 +
          args[4].dat->size[0] * 1 * (start[1] * args[4].stencil->stride[1] -
                                      args[4].dat->base[1] - d_m[1]);
  base4 = base4 +
          args[4].dat->size[0] * 1 * args[4].dat->size[1] * 1 *
              (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[5].dat->d_m[d];
#endif
  int base5 = 1 * 1 * (start[0] * args[5].stencil->stride[0] -
                       args[5].dat->base[0] - d_m[0]);
  base5 = base5 +
          args[5].dat->size[0] * 1 * (start[1] * args[5].stencil->stride[1] -
                                      args[5].dat->base[1] - d_m[1]);
  base5 = base5 +
          args[5].dat->size[0] * 1 * args[5].dat->size[1] * 1 *
              (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[6].dat->d_m[d];
#endif
  int base6 = 1 * 1 * (start[0] * args[6].stencil->stride[0] -
                       args[6].dat->base[0] - d_m[0]);
  base6 = base6 +
          args[6].dat->size[0] * 1 * (start[1] * args[6].stencil->stride[1] -
                                      args[6].dat->base[1] - d_m[1]);
  base6 = base6 +
          args[6].dat->size[0] * 1 * args[6].dat->size[1] * 1 *
              (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] -
               d_m[2]);

  ops_H_D_exchanges_device(args, 8);
  ops_halo_exchanges(args, 8, range);
  ops_H_D_exchanges_device(args, 8);

  if (OPS_diags > 1) {
    ops_timers_core(&c2, &t2);
    OPS_kernels[22].mpi_time += t2 - t1;
  }

  if (globalWorkSize[0] > 0 && globalWorkSize[1] > 0 && globalWorkSize[2] > 0) {

    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 0, sizeof(cl_mem),
                              (void *)&arg0.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 1, sizeof(cl_mem),
                              (void *)&arg1.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 2, sizeof(cl_mem),
                              (void *)&arg2.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 3, sizeof(cl_mem),
                              (void *)&arg3.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 4, sizeof(cl_mem),
                              (void *)&arg4.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 5, sizeof(cl_mem),
                              (void *)&arg5.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 6, sizeof(cl_mem),
                              (void *)&arg6.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 7, sizeof(cl_mem),
                              (void *)&arg7.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 8, sizeof(cl_int),
                              (void *)&base0));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 9, sizeof(cl_int),
                              (void *)&base1));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 10, sizeof(cl_int),
                              (void *)&base2));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 11, sizeof(cl_int),
                              (void *)&base3));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 12, sizeof(cl_int),
                              (void *)&base4));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 13, sizeof(cl_int),
                              (void *)&base5));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 14, sizeof(cl_int),
                              (void *)&base6));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 15, sizeof(cl_int),
                              (void *)&x_size));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 16, sizeof(cl_int),
                              (void *)&y_size));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 17, sizeof(cl_int),
                              (void *)&z_size));

    // call/enque opencl kernel wrapper function
    clSafeCall(clEnqueueNDRangeKernel(
        OPS_opencl_core.command_queue, OPS_opencl_core.kernel[22], 3, NULL,
        globalWorkSize, localWorkSize, 0, NULL, NULL));
  }
  if (OPS_diags > 1) {
    clSafeCall(clFinish(OPS_opencl_core.command_queue));
  }

  if (OPS_diags > 1) {
    ops_timers_core(&c1, &t1);
    OPS_kernels[22].time += t1 - t2;
  }

  ops_set_dirtybit_device(args, 8);
  ops_set_halo_dirtybit3(&args[0], range);
  ops_set_halo_dirtybit3(&args[1], range);
  ops_set_halo_dirtybit3(&args[2], range);
  ops_set_halo_dirtybit3(&args[3], range);
  ops_set_halo_dirtybit3(&args[4], range);
  ops_set_halo_dirtybit3(&args[5], range);
  ops_set_halo_dirtybit3(&args[6], range);

  if (OPS_diags > 1) {
    // Update kernel record
    ops_timers_core(&c2, &t2);
    OPS_kernels[22].mpi_time += t2 - t1;
    OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2);
    OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3);
    OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4);
    OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg5);
    OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg6);
  }
}
Esempio n. 21
0
   int main() {

         // Create the variables for the time measure
         int starttime, stoptime;

         //Get initial time
         starttime = GetTimeMs();

         // This code executes on the OpenCL host

         // Host data
         float *A=NULL; // Input array
         float *B=NULL; // Input array
         float *C=NULL; // Output array

         // Elements in each array
          const int elements=2048;
         // Compute the size of the data
         size_t datasize=sizeof(int)*elements;

         // Allocate space for input/output data
         A=(float*)malloc(datasize);
         B=(float*)malloc(datasize);
         C=(float*)malloc(datasize);

         // Initialize the input data
         A[0]=2.2;
         A[1]=1.3;
         B[0]=3.7;
         B[1]=5.4;


         // Load the kernel source code into the array programSource
	     FILE *fp;
	     char *programSource;
	     size_t programSize;
	 
	     fp = fopen("fplos_kernels.cl", "r");
	     if (!fp) {
	         fprintf(stderr, "Failed to load kernel.\n");
	         exit(1);
	     }
	     programSource = (char*)malloc(MAX_SOURCE_SIZE);
	     fclose( fp );

         // Use this to check the output of each API call
         cl_int status;

         // Retrieve the number of platforms
         cl_uint numPlatforms=0;
         status=clGetPlatformIDs(0, NULL,&numPlatforms);

         // Allocate enough space for each platform
         cl_platform_id *platforms=NULL;
         platforms=(cl_platform_id*)malloc(
              numPlatforms*sizeof(cl_platform_id));

         // Fill in the platforms
         status = clGetPlatformIDs(numPlatforms, platforms, NULL);

         // Retrieve the number of devices
         cl_uint numDevices=0;
         status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0,
               NULL,&numDevices);

         // Allocate enough space for each device
         cl_device_id *devices;
         devices = (cl_device_id*)malloc(
               numDevices*sizeof(cl_device_id));

         // Fill in the devices
         status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL,
              numDevices, devices, NULL);

         // Create a context and associate it with the devices
         cl_context context;
         context = clCreateContext(NULL, numDevices, devices, NULL,
             NULL, &status);

         // Create a command queue and associate it with the device
         cl_command_queue cmdQueue;
         cmdQueue = clCreateCommandQueue(context, devices[0], 0,
            &status);

         // Create a buffer object that will contain the data
         // from the host array A
         cl_mem bufA;
         bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize,
             NULL, &status);

         // Create a buffer object that will contain the data
         // from the host array B
         cl_mem bufB;
         bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize,
            NULL, &status);

         // Create a buffer object that will hold the output data
         cl_mem bufC;
         bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize,
            NULL, &status);

         // Write input array A to the device buffer bufferA
         status = clEnqueueWriteBuffer(cmdQueue, bufA, CL_FALSE,
            0, datasize, A, 0, NULL, NULL);

         // Write input array B to the device buffer bufferB
         status = clEnqueueWriteBuffer(cmdQueue, bufB, CL_FALSE,
            0, datasize, B, 0, NULL, NULL);

         // Create a program with source code
         cl_program program=clCreateProgramWithSource(context, 1,
            (const char**)&programSource, NULL, &status);

         // Build (compile) the program for the device
         status=clBuildProgram(program, numDevices, devices,
            NULL, NULL, NULL);

         // Create the vector addition kernel
         cl_kernel kernel;
         kernel=clCreateKernel(program, "floatadd", &status);

         // Associate the input and output buffers with the kernel
         status=clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA);
         status=clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB);
         status=clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC);

         // Define an index space (global work size) of work
         // items for execution. A workgroup size (local work size)
         // is not required, but can be used.
         size_t globalWorkSize[1];

         // There are 'elements' work-items
         globalWorkSize[0]=elements;

         // Execute the kernel for execution
         status=clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL,
            globalWorkSize, NULL, 0, NULL, NULL);

         // Read the device output buffer to the host output array
         clEnqueueReadBuffer(cmdQueue, bufC, CL_TRUE, 0,
            datasize, C, 0, NULL, NULL);

           printf("Output = %.1f\n", C[0]);
           printf("Output = %.1f\n", C[1]);

        // Free OpenCL resources
         clReleaseKernel(kernel);
         clReleaseProgram(program);
         clReleaseCommandQueue(cmdQueue);
         clReleaseMemObject(bufA);
         clReleaseMemObject(bufB);
         clReleaseMemObject(bufC);
         clReleaseContext(context);

         // Free host resources
         free(A);
         free(B);
         free(C);
         free(platforms);
         free(devices);

         //Get initial time
         stoptime = GetTimeMs();

         printf("Duration= %d ms\n", stoptime - starttime);
            
         return 0;
   }
Esempio n. 22
0
int main(int argc, const char * argv[])
{
	
	 //First we set the variables for measuring performance.
	 
	 struct timeval tim1, tim2;	                
	 uint64_t time;
	 
	 //Calling the function "gettimeofday" to measure the time before the program executes.
	 gettimeofday(&tim1, NULL);
		                

	/*
	 * These are the declarations of the OpenCL structures are described below:
	 * cl_platform-id - Stores the types of platforms installed on the host.
	 * cl_device_id - Stores the type of the device (GPU, CPU, etc.)
	 * cl_context - Stores the context in which a command queue can be created.
	 * cl_command_queue - Stores the command queue which governs how the GPU will
	 *                    will execute the kernel.
	 * cl_program - Stores the kernel code (which can be comprised of several kernels). Is compiled later its
	 * 				functions get packaged into kernels.
	 * cl_kernel - The OpenCL data structure that represents kernels.
	 */

    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
	cl_program program;
    cl_kernel kernel;
	
	//A cl_int used to store error flags that are returned if OpenCL function does not execute properly. 
	cl_int err;

    /*
	 * A file object and buffers used to store the input kernel code as well as allocate the memory for the kernel code 
	 * and the output log from the compiler during the compilation of the kernel code.
	 */
    
	FILE *program_handle;
    char *program_buffer, *program_log;
	size_t program_size, log_size;

	//The number of work items in each dimension of the data.
    size_t work_units_per_kernel;
    
	//This value determines the size of the nxn (square) array.
    int n = 1000;
    
	//Allocating the memory for the nxn arrays of floats.
    float **h_xx = (float**)malloc(sizeof(float*)*n);
    float **h_yy = (float**)malloc(sizeof(float*)*n);
    float **h_zz = (float**)malloc(sizeof(float*)*n);
    
    for(int i = 0; i<n; i++){
        h_xx[i] = (float*)malloc(sizeof(float)*n);
        h_yy[i] = (float*)malloc(sizeof(float)*n);
        h_zz[i] = (float*)malloc(sizeof(float)*n);
        
		//Initializing the arrays.
        for(int j = 0; j<n; j++){
            
            h_xx[i][j] = i+j;
            h_yy[i][j] = i+j;
            
        }
       
    }
    	
	/*
	 * These three variables of the type cl_mem (memory object) are used as buffers and hold the data which will
	 * be sent to the device and then once calculated sent back to the host.
	 */

    cl_mem d_xx;
    cl_mem d_yy;
    cl_mem d_zz;

	
    
    // Obtains the Platform information installed on the host and stores into the memory location of the variable "platform"
    err = clGetPlatformIDs(1, &platform, NULL);
    if(err != CL_SUCCESS){
        
        std::cout << "Error: Failed to locate Platform." << std::endl;
        exit(1);
    }
    
	// Obtains the device information (looking for specifically GPU devices) and stores it into the memory location of the variable "device"
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if(err != CL_SUCCESS){
        printf("Error: Failed to locate Device.");
        exit(1);
    }
    
    // Creates a context on the device and stores it into the "context" variable.  
	context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create context." << std::endl;
        exit(1);
    }
    
    /*
	 * The following code stores the file "arraySum.cl" into the FILE object "program_handle". It then determines the size
	 * of the file and reads the content into the variable "program_buffer".
	 */

	program_handle = fopen("flopstestloop.cl", "r");
    if(!program_handle){
        std::cout << "Error: Failed to Load Kernel" << std::endl;
        exit(1);
    }
    fseek(program_handle, 0, SEEK_END);
    program_size = ftell(program_handle);
    rewind(program_handle);
    program_buffer = (char*)malloc(program_size + 1);
    program_buffer[program_size] = '\0';
    fread(program_buffer, sizeof(char), program_size, program_handle);
    fclose(program_handle);
    
    // Stores the kernel code into a program and stores it into the "program" variable.
	program = clCreateProgramWithSource(context, 1, (const char **)&program_buffer, (const size_t *)&program_size, &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create the program" << std::endl;
        exit(1);
    }
    
    free(program_buffer);

    //Compiles the program and stores the compiled code into the argument "program"
	err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not compile the program" << std::endl;

		/*
		 * The following code first allocates the correct amount of memory in order to store the output of the compilers
		 * build log and then it stores this log into the buffer "program_log". Finally it prints this buffer to the
		 * screen.
		 */

        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        program_log = (char*)malloc(log_size+1);
        program_log[log_size] = '\0';
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
        printf("%s\n", program_log);
        free(program_log);
        exit(1);
    }
    
	//From the compiled code in the program creates a kernel called "arraysum"
    kernel = clCreateKernel(program, "arraysum", &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create the kernel" << std::endl;
        exit(1);
    }
    
	//Creates a command queue and stores it into the variable "queue".
    queue = clCreateCommandQueue(context, device, 0, &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create the queue" << std::endl;
        exit(1);
    }
    
    //Creating the Device memory buffers. These will be used to transfer data from the host to the device and vice versa.
    d_xx = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*n, NULL, &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create the buffer d_xx" << std::endl;
        exit(1);
    }
    
    d_yy = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*n, NULL, &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create the buffer d_yy" << std::endl;
        exit(1);
    }
    
    d_zz = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*n, NULL, &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create the buffer d_zz" << std::endl;
        exit(1);
    }
    
   /*
	* This for loop loops over the each row in the matrices x and y first writes the row to the device memory where 
	* the kernel arguments are then set and then then passed to the compiled kernel code already located on the device. 
	* Once executed, the results are then stored in the d_zz buffer and are read back to the host.
	*/

    for(int i = 0; i<n; i++)
    {
        //Writing the data from the host to the device
        err = clEnqueueWriteBuffer(queue, d_xx, CL_TRUE, 0, sizeof(float)*n, h_xx[i], 0, NULL, NULL);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not write to buffer d_xx" << std::endl;
            exit(1);
        }
        
        err = clEnqueueWriteBuffer(queue, d_yy, CL_TRUE, 0, sizeof(float)*n, h_yy[i], 0, NULL, NULL);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not write to buffer d_yy" << std::endl;
            exit(1);
        }
    
        //Setting the Kernel Arguments
        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_xx);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not set kernel argument h_xx." << std::endl;
            exit(1);
        }
    
        err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_yy);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not set kernel argument h_yy." << std::endl;
            exit(1);
        }
    
        err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_zz);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not set kernel argument h_zz." << std::endl;
        }
    
        work_units_per_kernel = n;
    
        //Executing the Kernel
        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units_per_kernel, NULL, 0, NULL, NULL);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not execute kernel." << std::endl;
            exit(1);
        }
    
        //Reading the Data from the Kernel
        err = clEnqueueReadBuffer(queue, d_zz, CL_TRUE, 0, n*(sizeof(float)), h_zz[i], 0, NULL, NULL);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not read data from kernel." << std::endl;
            exit(1);
        }
        
    }

    //Measuring the time after the OpenCL code has executed and has been copied back to the host.
	gettimeofday(&tim2, NULL);
	//Finding the difference between the two measured times.
	time = tim2.tv_sec - tim1.tv_sec;
	//Displaying the elapsed time in seconds.
	std::cout << time + (tim2.tv_usec - tim1.tv_usec)/1000000.00 << std::endl;

    //The previously allocated memory is freed.
    clReleaseMemObject(d_xx);
    clReleaseMemObject(d_yy);
    clReleaseMemObject(d_zz);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseContext(context);
    
    return 0;
}
Esempio n. 23
0
void spmv_csr_cpu(const csr_matrix* csr,const float* x,const float* y,float* out) {
    int num_rows = csr->num_rows;

    int sourcesize = 1024*1024;
	char * source = (char *)calloc(sourcesize, sizeof(char));
	if(!source) { fprintf(stderr, "ERROR: calloc(%d) failed\n", sourcesize); return -1; }

	// read the kernel core source
	char * kernel_csr_src  = "csr_ocl";
	char * tempchar = "./spmv_kernel.cl";
	FILE * fp = fopen(tempchar, "rb");
	if(!fp) { fprintf(stderr, "ERROR: unable to open '%s'\n", tempchar); return -1; }
	fread(source + strlen(source), sourcesize, 1, fp);
	fclose(fp);

	int use_gpu = 1;
	if(initialize(use_gpu)) return -1;

	// compile kernel
	cl_int err = 0;
	const char * slist[2] = { source, 0 };
	cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
	err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clBuildProgram() => %d\n", err); return -1; }

	cl_kernel kernel_csr;
	kernel_csr = clCreateKernel(prog, kernel_csr_src, &err);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
	clReleaseProgram(prog);

	cl_mem memAp;
	cl_mem memAj;
	cl_mem memAx;
	cl_mem memx;
	cl_mem memy;

	memAp = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*(csr.num_rows+1), NULL, &err);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;}
	memAj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*csr.num_nonzeros, NULL, &err );
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;}
	memAx = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_nonzeros, NULL, &err );
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;}
	memx = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_cols, NULL, &err );
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;}
	memy = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*csr.num_rows, NULL, &err );
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;}

	//write buffers
	err = clEnqueueWriteBuffer(cmd_queue, memAp, CL_FALSE, 0, sizeof(unsigned int)*csr.num_rows+4, csr->Ap, 0, NULL, NULL);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, memAj, CL_FALSE, 0, sizeof(unsigned int)*csr.num_nonzeros, csr->Aj, 0, NULL, NULL);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, memAx, CL_FALSE, 0, sizeof(float)*csr.num_nonzeros, csr->Ax, 0, NULL, NULL);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, memx, CL_FALSE, 0, sizeof(float)*csr.num_cols, x, 0, NULL, NULL);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, memy, CL_FALSE, 0, sizeof(float)*csr.num_rows, y, 0, NULL, NULL);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; }

	clSetKernelArg(kernel_csr, 0, sizeof(unsigned int *), (unsigned int *) &csr->num_rows);
	clSetKernelArg(kernel_csr, 1, sizeof(void *), (void*) &memAp);
	clSetKernelArg(kernel_csr, 2, sizeof(void *), (void*) &memAj);
	clSetKernelArg(kernel_csr, 3, sizeof(void *), (void*) &memAx);
	clSetKernelArg(kernel_csr, 2, sizeof(void *), (void*) &memx);
	clSetKernelArg(kernel_csr, 3, sizeof(void *), (void*) &memy);

	err = clEnqueueNDRangeKernel(cmd_queue, kernel_csr, 2, NULL, global_work, local_work, 0, 0, 0);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }

	err = clEnqueueReadBuffer(cmd_queue, memy, 1, 0, sizeof(float)*csr.num_rows, out, 0, 0, 0);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: 1  clEnqueueReadBuffer: out\n"); return -1; }

	clReleaseMemObject(memAp);
	clReleaseMemObject(memAj);
	clReleaseMemObject(memAx);
	clReleaseMemObject(memx);
	clReleaseMemObject(memy);
}
Esempio n. 24
0
void    vectorVectorAdditionGMDP (cl_uint numDevices,cl_device_id *devices, cl_program program,cl_context context,double * h_VectA,double *h_VectB, double *h_Output,int vectSize)
{
	cl_event                gpuExec[1];
        cl_int err;	
	cl_command_queue cmdQueue;   //holds command queue object
	cl_kernel kernel;		//holds kernel object
	cl_mem d_VectA,d_VectB,d_Output;		//holds device input output buffer
	 cl_event                events;        // events
	size_t globalWorkSize[2]={vectSize,vectSize}; //holds global group size

	 double                  gflops=0.0;             //holds total achieved gflops
        cl_ulong startTime, endTime,elapsedTime;        //holds time
        float executionTimeInSeconds;                    //holds total execution time

	  
    	/*create command queue*/
        cmdQueue = clCreateCommandQueue(context, devices[0],  CL_QUEUE_PROFILING_ENABLE, &err);
        if( err != CL_SUCCESS ||  cmdQueue == 0)
         {
               printf("\n\t Failed to create command queue  \n" );
               exit (-1);
         }
        
	/*create kernel object*/
        kernel = clCreateKernel(program,"VectVectAddDPKernel",&err);
        OPENCL_CHECK_STATUS("error while creating kernel",err);
        
	/*create buffer*/
       d_VectA=clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(double)*vectSize,h_VectA,&err);
        OPENCL_CHECK_STATUS("error while creating buffer for input",err);
        
       d_VectB=clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(double)*vectSize,h_VectB,&err);
        OPENCL_CHECK_STATUS("error while creating buffer for input",err);
        
	d_Output=clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(double)*vectSize,NULL,&err);
        OPENCL_CHECK_STATUS("error while creating buffer for d_Output",err);
        
	/*set kernel arg*/
        err=clSetKernelArg(kernel,0,sizeof(cl_mem),&d_VectA);
        OPENCL_CHECK_STATUS("error while setting arg 0",err);
        
	err=clSetKernelArg(kernel,1,sizeof(cl_mem),&d_VectB);
        OPENCL_CHECK_STATUS("error while setting arg 1",err);
        
	err=clSetKernelArg(kernel,2,sizeof(cl_mem),&d_Output);
        OPENCL_CHECK_STATUS("error while setting arg 2",err);
        
	/*load kernel*/
        err = clEnqueueNDRangeKernel(cmdQueue,kernel,2,NULL,globalWorkSize,NULL,0,NULL,&gpuExec[0]);
        OPENCL_CHECK_STATUS("error while creating ND range",err);
        
	//completion of all commands to command queue
        err = clFinish(cmdQueue);
        OPENCL_CHECK_STATUS("clFinish",err);

	/* calculate start time and end time*/
        clGetEventProfilingInfo(gpuExec[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL);
        clGetEventProfilingInfo(gpuExec[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);


	/* total elapsed time*/
        elapsedTime = endTime-startTime;

	/*total execution time*/
        executionTimeInSeconds = (float)(1.0e-9 * elapsedTime);


	
	/* reading buffer object*/
        err = clEnqueueReadBuffer(cmdQueue,d_Output,CL_TRUE,0,sizeof(cl_double)*vectSize,h_Output,0,0,&events);
        OPENCL_CHECK_STATUS("error while reading buffer",err);
	
	
	/* calculate total gflops*/
         gflops= (1.0e-9 * (( vectSize) / executionTimeInSeconds));


        // Print the gflops on the screen
         print_on_screen("Vector Vector Addition double precision using global memory",executionTimeInSeconds,vectSize,gflops,1);


	//check results 
	vectVectAddCheckResultGMDP(h_VectA,h_VectB,h_Output,vectSize);

	//release opencl objects
	clReleaseMemObject(d_VectA);
	clReleaseMemObject(d_VectB);
	clReleaseMemObject(d_Output);
	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(cmdQueue);
	clReleaseContext(context);
}
// host stub function
void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range,
 ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3,
 ops_arg arg4) {
  ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4};


  ops_timing_realloc(18,"advec_mom_kernel_post_pre_advec_x");
  OPS_kernels[18].count++;

  //compute locally allocated range for the sub-block
  int start[3];
  int end[3];
  #ifdef OPS_MPI
  sub_block_list sb = OPS_sub_block_list[block->index];
  if (!sb->owned) return;
  for ( int n=0; n<3; n++ ){
    start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n];
    if (start[n] >= range[2*n]) {
      start[n] = 0;
    }
    else {
      start[n] = range[2*n] - start[n];
    }
    if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n];
    if (end[n] >= range[2*n+1]) {
      end[n] = range[2*n+1] - sb->decomp_disp[n];
    }
    else {
      end[n] = sb->decomp_size[n];
    }
    if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n]))
      end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]);
  }
  #else //OPS_MPI
  for ( int n=0; n<3; n++ ){
    start[n] = range[2*n];end[n] = range[2*n+1];
  }
  #endif //OPS_MPI

  int x_size = MAX(0,end[0]-start[0]);
  int y_size = MAX(0,end[1]-start[1]);
  int z_size = MAX(0,end[2]-start[2]);


  int xdim0 = args[0].dat->size[0]*args[0].dat->dim;
  int ydim0 = args[0].dat->size[1];
  int xdim1 = args[1].dat->size[0]*args[1].dat->dim;
  int ydim1 = args[1].dat->size[1];
  int xdim2 = args[2].dat->size[0]*args[2].dat->dim;
  int ydim2 = args[2].dat->size[1];
  int xdim3 = args[3].dat->size[0]*args[3].dat->dim;
  int ydim3 = args[3].dat->size[1];
  int xdim4 = args[4].dat->size[0]*args[4].dat->dim;
  int ydim4 = args[4].dat->size[1];

  //build opencl kernel if not already built

  buildOpenCLKernels_advec_mom_kernel_post_pre_advec_x(
  xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4);

  //Timing
  double t1,t2,c1,c2;
  ops_timers_core(&c2,&t2);

  //set up OpenCL thread blocks
  size_t globalWorkSize[3] = {((x_size-1)/OPS_block_size_x+ 1)*OPS_block_size_x, ((y_size-1)/OPS_block_size_y + 1)*OPS_block_size_y, MAX(1,end[2]-start[2])};
  size_t localWorkSize[3] =  {OPS_block_size_x,OPS_block_size_y,1};





  int dat0 = args[0].dat->elem_size;
  int dat1 = args[1].dat->elem_size;
  int dat2 = args[2].dat->elem_size;
  int dat3 = args[3].dat->elem_size;
  int dat4 = args[4].dat->elem_size;

  //set up initial pointers
  int d_m[OPS_MAX_DIM];
  #ifdef OPS_MPI
  for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d];
  #else //OPS_MPI
  for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d];
  #endif //OPS_MPI
  int base0 = 1 * 
  (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]);
  base0 = base0 + args[0].dat->size[0] *
  (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]);
  base0 = base0 + args[0].dat->size[0] *  args[0].dat->size[1] *
  (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]);

  #ifdef OPS_MPI
  for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d];
  #else //OPS_MPI
  for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d];
  #endif //OPS_MPI
  int base1 = 1 * 
  (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]);
  base1 = base1 + args[1].dat->size[0] *
  (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]);
  base1 = base1 + args[1].dat->size[0] *  args[1].dat->size[1] *
  (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]);

  #ifdef OPS_MPI
  for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d];
  #else //OPS_MPI
  for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d];
  #endif //OPS_MPI
  int base2 = 1 * 
  (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]);
  base2 = base2 + args[2].dat->size[0] *
  (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]);
  base2 = base2 + args[2].dat->size[0] *  args[2].dat->size[1] *
  (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]);

  #ifdef OPS_MPI
  for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d];
  #else //OPS_MPI
  for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d];
  #endif //OPS_MPI
  int base3 = 1 * 
  (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]);
  base3 = base3 + args[3].dat->size[0] *
  (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]);
  base3 = base3 + args[3].dat->size[0] *  args[3].dat->size[1] *
  (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]);

  #ifdef OPS_MPI
  for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d];
  #else //OPS_MPI
  for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d];
  #endif //OPS_MPI
  int base4 = 1 * 
  (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]);
  base4 = base4 + args[4].dat->size[0] *
  (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]);
  base4 = base4 + args[4].dat->size[0] *  args[4].dat->size[1] *
  (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]);


  ops_H_D_exchanges_device(args, 5);
  ops_halo_exchanges(args,5,range);
  ops_H_D_exchanges_device(args, 5);

  ops_timers_core(&c1,&t1);
  OPS_kernels[18].mpi_time += t1-t2;


  clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 0, sizeof(cl_mem), (void*) &arg0.data_d ));
  clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 1, sizeof(cl_mem), (void*) &arg1.data_d ));
  clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 2, sizeof(cl_mem), (void*) &arg2.data_d ));
  clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 3, sizeof(cl_mem), (void*) &arg3.data_d ));
  clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 4, sizeof(cl_mem), (void*) &arg4.data_d ));
  clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 5, sizeof(cl_int), (void*) &base0 ));
  clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 6, sizeof(cl_int), (void*) &base1 ));
  clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 7, sizeof(cl_int), (void*) &base2 ));
  clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 8, sizeof(cl_int), (void*) &base3 ));
  clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 9, sizeof(cl_int), (void*) &base4 ));
  clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 10, sizeof(cl_int), (void*) &x_size ));
  clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 11, sizeof(cl_int), (void*) &y_size ));
  clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 12, sizeof(cl_int), (void*) &z_size ));

  //call/enque opencl kernel wrapper function
  clSafeCall( clEnqueueNDRangeKernel(OPS_opencl_core.command_queue, OPS_opencl_core.kernel[18], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) );
  if (OPS_diags>1) {
    clSafeCall( clFinish(OPS_opencl_core.command_queue) );
  }

  ops_set_dirtybit_device(args, 5);
  ops_set_halo_dirtybit3(&args[0],range);
  ops_set_halo_dirtybit3(&args[3],range);

  //Update kernel record
  ops_timers_core(&c2,&t2);
  OPS_kernels[18].time += t2-t1;
  OPS_kernels[18].transfer += ops_compute_transfer(dim, range, &arg0);
  OPS_kernels[18].transfer += ops_compute_transfer(dim, range, &arg1);
  OPS_kernels[18].transfer += ops_compute_transfer(dim, range, &arg2);
  OPS_kernels[18].transfer += ops_compute_transfer(dim, range, &arg3);
  OPS_kernels[18].transfer += ops_compute_transfer(dim, range, &arg4);
}
Esempio n. 26
0
void OpenCLExecuter::ocl_filter_multi(void)
{
	cl_int err;										// debugging variables
	size_t szParmDataBytes;							// Byte size of context information        

	cl_mem src_buffer[MAX_DEVICES];					// OpenCL device source buffer
	cl_mem dst_buffer[MAX_DEVICES];					// OpenCL device source buffer
	cl_command_queue queues[MAX_DEVICES];			// OpenCL device queue
	cl_kernel ckKernel[MAX_DEVICES];				// OpenCL kernel

	cl_event gpuDone[MAX_DEVICES];

//	int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth*3; // Length of float arrays to process

	int xdim, ydim, zdim;
	xdim = (float)volobj->texwidth; // (float)ocl_wrapper->numDevices;
	ydim = (float)volobj->texheight; // (float)ocl_wrapper->numDevices;
	zdim = (float)volobj->texdepth / (float)ocl_wrapper->numDevices;

	//Length of array to process
	int iNumElements = (xdim*ydim*zdim);
	size_t global_threads[3] = {xdim, ydim, zdim};
	
	//temp array
	unsigned char** data = new unsigned char*[ocl_wrapper->numDevices];

	for(int i=0; i<ocl_wrapper->numDevices; i++)
		data[i] = new unsigned char[iNumElements];

	for(int i=0; i<ocl_wrapper->numDevices; i++)
	{
		printf("OPENCL: Computing Device%d\n", i);

		//create the command queue we will use to execute OpenCL commands
		queues[i] = clCreateCommandQueue(ocl_wrapper->context, ocl_wrapper->devices[i], 0, &err);
		printf("OPENCL: clCreateCommandQueue: %s\n", ocl_wrapper->get_error(err));
		
		// allocate the source buffer memory object
		src_buffer[i] = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
		printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
		// allocate the destination buffer memory object
		dst_buffer[i] = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
		printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

		// Create the kernel
		ckKernel[i] = clCreateKernel (cpProgram, "myFunc", &err);
		printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
		// Set the Argument values
		err = clSetKernelArg (ckKernel[i], 0, sizeof(cl_mem), (void*)&src_buffer[i]);
		printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
		err = clSetKernelArg (ckKernel[i], 1, sizeof(cl_mem), (void*)&dst_buffer[i]);
		printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
		err = clSetKernelArg (ckKernel[i], 2, sizeof(int), (void*)&global_threads[0]);
		printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
		err = clSetKernelArg (ckKernel[i], 3, sizeof(int), (void*)&global_threads[1]);
		printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
		err = clSetKernelArg (ckKernel[i], 4, sizeof(int), (void*)&global_threads[2]);
		printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	
		//Prepare data to upload
		int iOffsetElements = (xdim*ydim*zdim*i);
		for(int j=iOffsetElements; j<iNumElements+iOffsetElements; j++)
			data[i][j-iOffsetElements] = volobj->texture3d[3*j+0];

		// Write data from host to GPU
		err = clEnqueueWriteBuffer (queues[i], src_buffer[i], CL_FALSE, 0, sizeof(unsigned char) * iNumElements, data[i], 0, NULL, NULL);
		printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	}

	for(int i=0; i<ocl_wrapper->numDevices; i++)
	{
		// Launch kernel 
		err = clEnqueueNDRangeKernel (queues[i], ckKernel[i], 3, NULL, global_threads, NULL, 0, NULL, NULL);
		printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));
	}

	for(int i=0; i<ocl_wrapper->numDevices; i++)
	{
		// Blocking read of results from GPU to Host
		err = clEnqueueReadBuffer (queues[i], dst_buffer[i], CL_TRUE, 0, sizeof(unsigned char) * iNumElements, data[i], 0, NULL,  &gpuDone[i]);
		printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));
	}

	// Synchronize with the GPUs
    printf("OPENCL: Waiting for devices to sync\n");
	clWaitForEvents(ocl_wrapper->numDevices, gpuDone);

	for(int i=0; i<ocl_wrapper->numDevices; i++)
	{
		//read data back
		int iOffsetElements = (xdim*ydim*zdim*i);
		for(int j=iOffsetElements; j<iNumElements+iOffsetElements; j++)
			volobj->texture3d[3*j+0] = data[i][j-iOffsetElements];
	}

	for(int i=0; i<ocl_wrapper->numDevices; i++)
	{
		// Cleanup allocated objects
		printf("OPENCL: Releasing kernel memory\n");
		if(ckKernel[i])clReleaseKernel(ckKernel[i]); 
   
		//need to release any other OpenCL memory objects here
		if(dst_buffer[i])clReleaseMemObject(dst_buffer[i]);
		if(src_buffer[i])clReleaseMemObject(src_buffer[i]);
	}

	for(int i=0; i<ocl_wrapper->numDevices; i++)
		delete[] data[i];

	delete[] data;
}
Esempio n. 27
0
void spmv_bcsr_ocl(b4csr_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable)
{
    cl_device_id* devices = NULL;
    cl_context context = NULL;
    cl_command_queue cmdQueue = NULL;
    cl_program program = NULL;

    assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1);

    cl_int errorCode = CL_SUCCESS;

    //Create device memory objects
    cl_mem devRowPtr;
    cl_mem devColid;
    cl_mem devData;
    cl_mem devVec;
    cl_mem devRes;
    cl_mem devTexVec;

    //Initialize values
    int data_align = mat->b4csr_aligned_size;
    int nnz = mat->matinfo.nnz;
    int rownum = mat->matinfo.height;
    int blockrownum = mat->b4csr_row_num;
    int blocknum = mat->b4csr_block_num;
    int vecsize = mat->matinfo.width;
    int bwidth = mat->b4csr_bwidth;
    int bheight = mat->b4csr_bheight;
    int width4num = bwidth / 4;
    int padveclen = findPaddedSize(vecsize, 8);
    float* paddedvec = (float*)malloc(sizeof(float)*padveclen);
    memset(paddedvec, 0, sizeof(float)*padveclen);
    memcpy(paddedvec, vec, sizeof(float)*vecsize);
    ALLOCATE_GPU_READ(devRowPtr, mat->b4csr_row_ptr, sizeof(int)*(blockrownum + 1));
    ALLOCATE_GPU_READ(devColid, mat->b4csr_col_id, sizeof(int)*blocknum);
    ALLOCATE_GPU_READ(devData, mat->b4csr_data, sizeof(float)*data_align*width4num*bheight);
    ALLOCATE_GPU_READ(devVec, paddedvec, sizeof(float)*padveclen);
    int paddedres = findPaddedSize(rownum, 512);
    devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR;
    errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR;
    const cl_image_format floatFormat =
    {
	CL_RGBA,
	CL_FLOAT,
    };


    int width = VEC2DWIDTH;
    int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH;
    if (height % 4 != 0)
	height += (4 - (height % 4));
    float* image2dVec = (float*)malloc(sizeof(float)*width*height);
    memset(image2dVec, 0, sizeof(float)*width*height);
    for (int i = 0; i < vecsize; i++)
    {
	image2dVec[i] = vec[i];
    }
    size_t origin[] = {0, 0, 0};
    size_t vectorSize[] = {width, height/4, 1};
    devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height/4, 0, NULL, &errorCode); CHECKERROR;
    errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR;
    clFinish(cmdQueue);

    //printf("\nvec length %d padded length %d", mat->matinfo.width, padveclength);

    opttime = 10000.0f;
    optmethod = 0;
    int dim2 = dim2Size;
    {
	int methodid = 0;
	cl_uint work_dim = 2;
	size_t blocksize[] = {CSR_VEC_GROUP_SIZE, 1};
	int gsize = blockrownum * CSR_VEC_GROUP_SIZE;
	size_t globalsize[] = {gsize, dim2};
	int data_align4 = data_align / 4;
	char kernelname[100] = "gpu_bcsr_red_00";
	kernelname[13] += bheight;
	kernelname[14] += bwidth;

	cl_kernel csrKernel = NULL;
	csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowPtr); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 5, sizeof(int),    &data_align4); CHECKERROR;

	for (int k = 0; k < 3; k++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);

	double teststart = timestamp();
	for (int i = 0; i < ntimes; i++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);
	double testend = timestamp();
	double time_in_sec = (testend - teststart)/(double)dim2;
	double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9;
	printf("\nBCSR %dx%d block cpu time %lf ms GFLOPS %lf code %d \n\n", bheight, bwidth,  time_in_sec / (double) ntimes * 1000, gflops, methodid);

	if (csrKernel)
	    clReleaseKernel(csrKernel);

	double onetime = time_in_sec / (double) ntimes;
	floptable[methodid] = gflops;
	if (onetime < opttime)
	{
	    opttime = onetime;
	    optmethod = methodid;
	    optflop = gflops;
	}

    }

    //Clean up
    if (image2dVec)
	free(image2dVec);

    if (devRowPtr)
	clReleaseMemObject(devRowPtr);
    if (devColid)
	clReleaseMemObject(devColid);
    if (devData)
	clReleaseMemObject(devData);
    if (devVec)
	clReleaseMemObject(devVec);
    if (devTexVec)
	clReleaseMemObject(devTexVec);
    if (devRes)
	clReleaseMemObject(devRes);


    freeObjects(devices, &context, &cmdQueue, &program);
}
Esempio n. 28
0
void OpenCLExecuter::ocl_filter(int src_chan)
{
	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem dst_buffer;					// OpenCL device source buffer
	size_t szGlobalWorkSize;			// 1D var for Total # of work items
	size_t szLocalWorkSize;				// 1D var for # of work items in the work group
	cl_kernel ckKernel;					// OpenCL kernel

	int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process

	//temp array
	unsigned char* data = new unsigned char[iNumElements];

	// set Local work size dimensions
	//szLocalWorkSize = 256;
	// set Global work size dimensions
	//szGlobalWorkSize = roundup((int) iNumElements/szLocalWorkSize, 0)*szLocalWorkSize;  
	//szGlobalWorkSize = iNumElements;
//	printf("OPENCL: number of elements: %d\n", (int)iNumElements);
//	printf("OPENCL: local worksize: %d\n", (int)szLocalWorkSize);
//	printf("OPENCL: global worksize: %d\n", (int)szGlobalWorkSize);
//	printf("OPENCL: work groups: %d\n", (int)((float)szGlobalWorkSize/(float)szLocalWorkSize));

	size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth};

	// allocate the source buffer memory object
	src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
	// allocate the destination buffer memory object
	dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

    // Create the kernel
	ckKernel = clCreateKernel (cpProgram, "myFunc", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&volobj->texwidth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&volobj->texheight);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&volobj->texdepth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	size_t local;
	err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
	printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
	printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end

	//Prepare data to upload
	for(int j=0; j<iNumElements; j++)
		data[j] = volobj->texture3d[3*j+src_chan];

	// Write data from host to GPU
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, data, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
	// Write data from host to GPU
//	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, volobj->texture3d, 0, NULL, NULL);
//	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	// Launch kernel 
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	// Blocking read of results from GPU to Host
//	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, volobj->texture3d, 0, NULL, NULL);
//	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));
		// Blocking read of results from GPU to Host

	// Blocking read of results from GPU to Host
	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, data, 0, NULL,  NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	//read data back
	for(int i=0; i<iNumElements; i++)
	{
			if(volobj->is_greyscale)
				volobj->texture3d[3*i+0] = volobj->texture3d[3*i+1] = volobj->texture3d[3*i+2] = data[i];
			else
				volobj->texture3d[3*i+src_chan] = data[i];
	}

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
    //need to release any other OpenCL memory objects here
    if(dst_buffer)clReleaseMemObject(dst_buffer);
    if(src_buffer)clReleaseMemObject(src_buffer);

	delete[] data;
}
void Convolutioner_FrequencyDomain_OpenCL::process(AudioInOutBuffers<float_type>& audio ) {
    
    //
    unsigned int _2B = audio.channelLength_ * 2;
    unsigned int _B  = audio.channelLength_;
    unsigned int _C  = audio.numOfChannels_;                        //numOfChannels
    unsigned int _P  = partitionedIR_.get_numOfPartsPerChannel();   //numOfIRPartsPerChannel
    //.
    
    //_ if >>>latency<<< or >>>number of channels<<< changed:
    //      set partitionedIR
    //      recreate buffers
    //      recreate fft plans
    if ( window_.get_inputBlockSize() != audio.channelLength_ || window_.get_numOfChannels() != audio.numOfChannels_) {
        
        //Setting partitionedIR
        if (window_.get_inputBlockSize() != audio.channelLength_) {
            
            partitionedIR_.setNewIRF( irf_, audio.channelLength_ );
            _P = partitionedIR_.get_numOfPartsPerChannel();
            
            //Recreate, initialize buffers, and set as kernel arguments: PIR
            //recreate
            bufferPIR_R_.recreate(CL_MEM_READ_ONLY, _2B * _C * _P);
            bufferPIR_I_.recreate(CL_MEM_READ_ONLY, _2B * _C * _P);
            //.
            
            //initialize
            bufferPIR_R_.set(partitionedIR_.real_     );
            bufferPIR_I_.set(partitionedIR_.imaginary_);
            //.
            
            //set as kernel argument
            bufferPIR_R_.setAsKernelArgument(0, complexMultiplyAdd_kernel_);
            bufferPIR_I_.setAsKernelArgument(1, complexMultiplyAdd_kernel_);
            //.
            //.(Recreate...)
            
        }
        //.
        
        //Recreate initialize buffers, and set as kernel arguments: transform, FDL, accumulator
        //recreate
        /****/bufferTransform_R_.recreate(CL_MEM_READ_WRITE,    _2B * _C        );
        /****/bufferTransform_I_.recreate(CL_MEM_READ_WRITE,    _2B * _C        );
        /**********/bufferFDL_R_.recreate(CL_MEM_READ_WRITE,    _2B * _C * _P   );
        /**********/bufferFDL_I_.recreate(CL_MEM_READ_WRITE,    _2B * _C * _P   );
        /**/bufferAccumulator_R_.recreate(CL_MEM_READ_WRITE,    _2B * _C        );
        /**/bufferAccumulator_I_.recreate(CL_MEM_READ_WRITE,    _2B * _C        );
        cpu_bufferAccumulator_R_ = new float_type[_2B * _C ];
        cpu_bufferAccumulator_I_ = new float_type[_2B * _C ];
        //.
        
        //initialize FDL with 0
        bufferFDL_R_.fillWithZero();
        bufferFDL_I_.fillWithZero();
        lastInsertedDelayLineIdx = 0;
        //.
        
        //set as kernel argument
        /**********/bufferFDL_R_.setAsKernelArgument(2, complexMultiplyAdd_kernel_);
        /**********/bufferFDL_I_.setAsKernelArgument(3, complexMultiplyAdd_kernel_);
        /**/bufferAccumulator_R_.setAsKernelArgument(4, complexMultiplyAdd_kernel_);
        /**/bufferAccumulator_I_.setAsKernelArgument(5, complexMultiplyAdd_kernel_);
        //.
        //.(Recreate...)
        
        //Recreate plans
        clFFT_Dim3 dim;
        dim.x = _2B;
        dim.y = 1;
        dim.z = 1;
        fftPlan_ = clFFT_CreatePlan(context_, dim, clFFT_1D, clFFT_SplitComplexFormat, &lastCommandStatus_);
        //.
    }
    
    //update each time bufferGlobalParameters because of incrementing of lastInsertedDelayLineIdx
    /*(_2B, _C, _P, pir_C, FDL_LINE)*/
    cpuData_bufferGlobalParameters_[0] = _2B;
    cpuData_bufferGlobalParameters_[1] = _C;
    cpuData_bufferGlobalParameters_[2] = _P;
    cpuData_bufferGlobalParameters_[3] = irf_->numOfChannels_;
    cpuData_bufferGlobalParameters_[4] = lastInsertedDelayLineIdx;
    
    bufferGlobalParameters_.set(cpuData_bufferGlobalParameters_);
    //.
    
    //Update channelsWindow
    window_.update( audio, /*history size*/ _B );
    //.
    
    //Init >>bufferTransform<<
    bufferTransform_R_.set(window_.buffer_.data_);
    for(unsigned int i = 0; i < _2B * _C; ++i)
        cpu_bufferAccumulator_I_[i]=0;
    bufferTransform_I_.set(cpu_bufferAccumulator_I_);
    //.
    
    //Make fft of bufferTransform
    lastCommandStatus_ = clFFT_ExecutePlannar(  cmdQueue_, fftPlan_, _C, clFFT_Forward,
                                              bufferTransform_R_, bufferTransform_I_,
                                              bufferTransform_R_, bufferTransform_I_,
                                              0, NULL, NULL );
    //.
    
    //Copy bufferTransform into bufferFDL (inserting new delay line) (real and imaginary part)
    clEnqueueCopyBuffer(    cmdQueue_, bufferTransform_R_, bufferFDL_R_,
                        0, lastInsertedDelayLineIdx * (_2B * _C ) * sizeof(float_type), (_2B * _C ) * sizeof(float_type),
                        0, NULL, NULL);
    clEnqueueCopyBuffer(    cmdQueue_, bufferTransform_I_, bufferFDL_I_,
                        0, lastInsertedDelayLineIdx * (_2B * _C ) * sizeof(float_type), (_2B * _C ) * sizeof(float_type),
                        0, NULL, NULL);
    //.
    
    //Increment host lastInsertedDelayLine
    lastInsertedDelayLineIdx = (lastInsertedDelayLineIdx + 1 ) % _P;
    //.
    
    //Execute kernel
    size_t globalWorkSize[1];
    globalWorkSize[0] =  _2B * _C /* == window_.get_allLength() */;
    lastCommandStatus_ = clEnqueueNDRangeKernel(cmdQueue_, complexMultiplyAdd_kernel_, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
    if(lastCommandStatus_ == -4) {
        std::cout << "Too much amount of memory must be allocated on the GPU due to lenght of impulse response and number of channels.";
        throw int();
    }
    else if(lastCommandStatus_ != 0) {
        std::cout << "Error while sending clEnqueueNDRangeKernel.";
        throw int();
    }
    //.
    
    //ifft of bufferAccumulator
    lastCommandStatus_ = clFFT_ExecutePlannar(  cmdQueue_, fftPlan_, _C, clFFT_Inverse,
                                              bufferAccumulator_R_, bufferAccumulator_I_,
                                              bufferAccumulator_R_, bufferAccumulator_I_,
                                              0, NULL, NULL );
    //.
    
    //Copy from bufferAccumulator to cpu
    bufferAccumulator_R_.get(cpu_bufferAccumulator_R_);
    //.
    
    //Flushing and finishing
    clFlush(cmdQueue_);
    clFinish(cmdQueue_);
    //.
    
    //Write fftw vector form to audio.outputChannel[number of Channel]
    for (unsigned int channNum = 0; channNum < _C; ++channNum)
        for (unsigned sampleNum = 0; sampleNum < _B; ++sampleNum)
            audio.out_[channNum][sampleNum] = (cpu_bufferAccumulator_R_[channNum*_2B + _B + sampleNum])/_2B;
    //.
}
Esempio n. 30
0
int crackMD5(char *hash, char *cs, int passlen) {

	clut_device dev;	// device struct
	cl_event  evt;      // performance measurement event
	cl_kernel kernel;	// execution kernel
	cl_int ret;			// error code

	double td;
	int cs_len, sync_flag;
	long chunk, disp;
	unsigned char bin_hash[HASH_SIZE];

	cs_len = strlen(cs);
	sync_flag = 0;
	strToBin(hash, bin_hash, 2*HASH_SIZE);

	disp = DISPOSITIONS(cs_len, passlen);
	chunk = DISP_PER_CORE(disp, AVAILABLE_THREADS);

	debug("HOST", "Numero di disposizione da calcolare per stream processing unit = %lu\n", chunk);

	clut_open_device(&dev, PATH_TO_KERNEL);
	clut_print_device_info(&dev);


	/* ----------------------------------------- Create execution kernel ----------------------------------------- */
	kernel = clCreateKernel(dev.program, KERNEL_NAME, &ret);
	clut_check_err(ret, "Fallita la creazione del kernel");


	/* ----------------------------------- Create memory buffers on the device ----------------------------------- */
	cl_mem dchunk = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, sizeof(long), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione del chunk");

	cl_mem dhash = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, HASH_SIZE * sizeof(unsigned char), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione dell'hash");

	cl_mem charset = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, cs_len * sizeof(char), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione del charset");

	cl_mem charset_size = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della taglia del charset");

	cl_mem dpasslen = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della taglia del charset");

	//cl_mem sync = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, AVAILABLE_CORES * sizeof(int), NULL, &ret);
	cl_mem sync = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, sizeof(int), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione del flag di sync");

	cl_mem dcracked = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, HASH_SIZE, NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della password in chiaro");

	cl_mem computed_hash = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, HASH_SIZE * sizeof(unsigned char), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della password in chiaro");


	/* ----------------------------------- Write memory buffers on the device ------------------------------------ */
	ret = clEnqueueWriteBuffer(dev.queue, dchunk, CL_TRUE, 0, sizeof(long), &chunk, 0, NULL, NULL);
	if(ret)
	   clut_panic(ret, "Fallita la scrittura del chunk sul buffer di memoria del device");

	ret = clEnqueueWriteBuffer(dev.queue, dhash, CL_TRUE, 0, HASH_SIZE * sizeof(unsigned char), (int *)bin_hash, 0, NULL, NULL);
	if(ret)
	   clut_panic(ret, "Fallita la scrittura dell'hash sul buffer di memoria del device");

	ret = clEnqueueWriteBuffer(dev.queue, charset, CL_TRUE, 0, cs_len * sizeof(char), cs, 0, NULL, NULL);
	if(ret)
	   clut_panic(ret, "Fallita la scrittura del charset sul buffer di memoria del device");

	ret = clEnqueueWriteBuffer(dev.queue, charset_size, CL_TRUE, 0, sizeof(int), &cs_len, 0, NULL, NULL);
	if(ret)
	   clut_panic(ret, "Fallita la scrittura della taglia del charset sul buffer di memoria del device");

	ret = clEnqueueWriteBuffer(dev.queue, dpasslen, CL_TRUE, 0, sizeof(int), &passlen, 0, NULL, NULL);
	if(ret)
	   clut_panic(ret, "Fallita la scrittura della taglia del charset sul buffer di memoria del device");

	//ret = clEnqueueWriteBuffer(dev.queue, sync, CL_TRUE, 0, AVAILABLE_CORES * sizeof(int), &sync_flag, 0, NULL, NULL);
	ret = clEnqueueWriteBuffer(dev.queue, sync, CL_TRUE, 0, sizeof(int), &sync_flag, 0, NULL, NULL);
		if(ret)
		   clut_panic(ret, "Fallita la scrittura della taglia del charset sul buffer di memoria del device");

	/* --------------------------------- Set the arguments to our compute kernel --------------------------------- */
	ret  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &dchunk);
	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dhash);
	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &charset);
	ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &charset_size);
	ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &dpasslen);
	ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &sync);
	ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), &dcracked);
	ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &computed_hash);
	clut_check_err(ret, "Fallito il setting degli argomenti del kernel");


	/* ---------------------------------------- Execute the OpenCL kernel ---------------------------------------- */
	size_t global_dim[] = { AVAILABLE_THREADS };
	ret = clEnqueueNDRangeKernel(dev.queue, kernel, 1, NULL, global_dim, NULL, 0, NULL, &evt);
	if(ret)
	   clut_check_err(ret, "Fallita l'esecuzione del kernel");


	/* -------------------------- Read the device memory buffer to the local variable ---------------------------- */
	//int found[80];
	int found;
	int digest[HASH_SIZE/sizeof(int)];
	char *password = (char *) malloc(passlen * sizeof(char) + 1);
	memset(password, 0, passlen * sizeof(char) + 1);
	//memset(found, 0, AVAILABLE_CORES * sizeof(int));

	//ret = clEnqueueReadBuffer(dev.queue, sync, CL_TRUE, 0, AVAILABLE_CORES * sizeof(int), found, 0, NULL, NULL);
	ret = clEnqueueReadBuffer(dev.queue, sync, CL_TRUE, 0, sizeof(int), &found, 0, NULL, NULL);
	if(ret)
	   clut_check_err(ret, "Fallimento nel leggere se la password e' stata trovata con successo");
	debug("HOST", "La password e' stata trovata dal kernel OpenCL? ");

	/*int i;
	for(i=0; i<AVAILABLE_CORES; i++){
		printf(" %d ", found[i]);
	}
	printf("\n");*/

	if(found){
	   ret = clEnqueueReadBuffer(dev.queue, dcracked, CL_TRUE, 0, HASH_SIZE, digest, 0, NULL, NULL);
	   if(ret)
	      clut_check_err(ret, "Fallimento nel leggere la password");
	   printf("Si. Password: %s\n", (char *)digest);
	}
	else
		printf("No.\n");

	/* ------------------------------------- Return kernel execution time ---------------------------------------- */
	td = clut_get_duration(evt);
	debug("HOST","Kernel duration: %f secs\n", td);

	/* ----------------------------------------------- Clean up -------------------------------------------------- */
	ret  = clReleaseKernel(kernel);
	ret |= clReleaseMemObject(dchunk);
	ret |= clReleaseMemObject(dhash);
	ret |= clReleaseMemObject(charset);
	ret |= clReleaseMemObject(charset_size);
	ret |= clReleaseMemObject(dpasslen);
	ret |= clReleaseMemObject(sync);
	ret |= clReleaseMemObject(dcracked);
	ret |= clReleaseMemObject(computed_hash);
	clut_check_err(ret, "Rilascio di risorse fallito");

	clFinish(dev.queue);

	clut_close_device(&dev);

	return 0;
}