Beispiel #1
0
    /// Enqueues a command to fill \p buffer with \p pattern.
    ///
    /// \see_opencl_ref{clEnqueueFillBuffer}
    ///
    /// \opencl_version_warning{1,2}
    ///
    /// \see fill()
    event enqueue_fill_buffer(const buffer &buffer,
                              const void *pattern,
                              size_t pattern_size,
                              size_t offset,
                              size_t size,
                              const wait_list &events = wait_list())
    {
        BOOST_ASSERT(m_queue != 0);
        BOOST_ASSERT(offset + size <= buffer.size());
        BOOST_ASSERT(buffer.get_context() == this->get_context());

        event event_;

        cl_int ret = clEnqueueFillBuffer(
            m_queue,
            buffer.get(),
            pattern,
            pattern_size,
            offset,
            size,
            events.size(),
            events.get_event_ptr(),
            &event_.get()
        );

        if(ret != CL_SUCCESS){
            BOOST_THROW_EXCEPTION(opencl_error(ret));
        }

        return event_;
    }
JNIEXPORT jint JNICALL Java_org_lwjgl_opencl_CL12_nclEnqueueFillBuffer(JNIEnv *env, jclass clazz, jlong command_queue, jlong buffer, jlong pattern, jlong pattern_size, jlong offset, jlong size, jint num_events_in_wait_list, jlong event_wait_list, jlong event, jlong function_pointer) {
	const cl_void *pattern_address = (const cl_void *)(intptr_t)pattern;
	const cl_event *event_wait_list_address = (const cl_event *)(intptr_t)event_wait_list;
	cl_event *event_address = (cl_event *)(intptr_t)event;
	clEnqueueFillBufferPROC clEnqueueFillBuffer = (clEnqueueFillBufferPROC)((intptr_t)function_pointer);
	cl_int __result = clEnqueueFillBuffer((cl_command_queue)(intptr_t)command_queue, (cl_mem)(intptr_t)buffer, pattern_address, pattern_size, offset, size, num_events_in_wait_list, event_wait_list_address, event_address);
	return __result;
}
Beispiel #3
0
cl_int zero_cl_mem(clctx_t *c, cl_mem buffer, size_t size)
{
	cl_int ret;
	uint32_t z = 0;

	ret = clEnqueueFillBuffer (c->command_queue, buffer, &z, sizeof(z), 0, size, 0, NULL, NULL);
	CL_ERR_RET("clEnqueueFillBuffer (in zero_cl_mem)", ret);

	return ret;
}
void oskar_mem_clear_contents(oskar_Mem* mem, int* status)
{
    size_t size;

    /* Check if safe to proceed. */
    if (*status) return;

    /* Compute the size. */
    size = mem->num_elements * oskar_mem_element_size(mem->type);

    /* Clear the memory. */
    if (mem->location == OSKAR_CPU)
    {
        memset(mem->data, 0, size);
    }
    else if (mem->location == OSKAR_GPU)
    {
#ifdef OSKAR_HAVE_CUDA
        cudaMemset(mem->data, 0, size);
#else
        *status = OSKAR_ERR_CUDA_NOT_AVAILABLE;
#endif
    }
    else if (mem->location & OSKAR_CL)
    {
#ifdef OSKAR_HAVE_OPENCL
        cl_event event;
        cl_int error;
        char zero = '\0';
        error = clEnqueueFillBuffer(oskar_cl_command_queue(),
                mem->buffer, &zero, sizeof(char), 0, size, 0, NULL, &event);
        clWaitForEvents(1, &event); /* This is required. */
        if (error != CL_SUCCESS)
        {
            fprintf(stderr, "clEnqueueFillBuffer() error (%d)\n", error);
            *status = OSKAR_ERR_INVALID_ARGUMENT;
        }
#else
        *status = OSKAR_ERR_OPENCL_NOT_AVAILABLE;
#endif
    }
    else
    {
        *status = OSKAR_ERR_BAD_LOCATION;
    }
}
	static void clwClearOutOrPart(int p_n_layers, int* p_n_neurons, std::vector<cl_mem> p_buffers)
	{
		clwInitLib();
		char pattern = 0;
		cl_int res;
		for (int i=1; i<p_n_layers; i++)
		{
			res = clEnqueueFillBuffer(g_cl_Command_Queue, p_buffers[i], &pattern, 1, 0, sizeof(real) * (p_n_neurons[i]-1), 0, nullptr, nullptr);
			if (res != CL_SUCCESS) {std::cerr<<"clCrearOutOrPart :"<<res<<std::endl; getchar(); }
		}
		#ifdef FINISH
		res = clFinish(g_cl_Command_Queue);
		#else
		res = clEnqueueBarrierWithWaitList(g_cl_Command_Queue, 0, nullptr, nullptr);
		#endif
		if (res != CL_SUCCESS) {std::cerr<<"clCrearOutOrPart :"<<res<<std::endl; getchar(); }
	}
/*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
 */
static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
{
    if (natoms_clear == 0)
    {
        return;
    }

    cl_int gmx_used_in_debug cl_error;

    cl_atomdata_t           *atomData = nb->atdat;
    cl_command_queue         ls       = nb->stream[eintLocal];
    cl_float                 value    = 0.0f;

    cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float),
                                   0, natoms_clear*sizeof(rvec), 0, nullptr, nullptr);
    GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("nbnxn_ocl_clear_f failed: " +
                                                ocl_get_error_string(cl_error)).c_str());
}
Beispiel #7
0
static int enqueue_zero_buffer(cl_command_queue queue,
                               cl_mem buffer,
                               size_t size,
                               cl_uint num_events_in_wait_list,
                               const cl_event *event_wait_list,
                               cl_event *event,
                               cl_int *err)
{
    cl_int _err;
    cl_uchar c = 0;

    if (!err) err = &_err;

    *err = clEnqueueFillBuffer(queue, (cl_mem)buffer, &c, sizeof(c), 0,
                               size, num_events_in_wait_list, event_wait_list,
                               event);
    CHECK_CL_ERROR(*err);

    return 0;
error:
    return -1;
}
Beispiel #8
0
int
main(void)
{
  cl_int err;
  cl_platform_id platforms[MAX_PLATFORMS];
  cl_uint nplatforms;
  cl_device_id devices[MAX_DEVICES];
  cl_uint ndevices;
  cl_uint i, j;
  size_t el, row, col;

  CHECK_CL_ERROR(clGetPlatformIDs(MAX_PLATFORMS, platforms, &nplatforms));

  for (i = 0; i < nplatforms; i++)
  {
    CHECK_CL_ERROR(clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, MAX_DEVICES,
      devices, &ndevices));

    /* Only test the devices we actually have room for */
    if (ndevices > MAX_DEVICES)
      ndevices = MAX_DEVICES;

    for (j = 0; j < ndevices; j++)
    {
      /* skip devices that do not support images */
      cl_bool has_img;
      CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE_SUPPORT, sizeof(has_img), &has_img, NULL));
      if (!has_img)
        continue;

      cl_context context = clCreateContext(NULL, 1, &devices[j], NULL, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clCreateContext");
      cl_command_queue queue = clCreateCommandQueue(context, devices[j], 0, &err);
      CHECK_OPENCL_ERROR_IN("clCreateCommandQueue");

      cl_ulong alloc;
      size_t max_height;
      size_t max_width;
#define MAXALLOC (1024U*1024U)

      CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE,
          sizeof(alloc), &alloc, NULL));
      CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE2D_MAX_WIDTH,
          sizeof(max_width), &max_width, NULL));
      CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE2D_MAX_HEIGHT,
          sizeof(max_height), &max_height, NULL));


      while (alloc > MAXALLOC)
        alloc /= 2;

      // fit at least one max_width inside the alloc (shrink max_width for this)
      while (max_width*pixel_size > alloc)
        max_width /= 2;

      // round number of elements to next multiple of max_width elements
      const size_t nels = (alloc/pixel_size/max_width)*max_width;
      const size_t buf_size = nels*pixel_size;

      cl_image_desc img_desc;
      memset(&img_desc, 0, sizeof(img_desc));
      img_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
      img_desc.image_width = max_width;
      img_desc.image_height = nels/max_width;
      img_desc.image_depth = 1;

      cl_ushort null_pixel[4] = {0, 0, 0, 0};
      cl_ushort *host_buf = malloc(buf_size);
      TEST_ASSERT(host_buf);

      for (el = 0; el < nels; el+=4) {
        host_buf[el] = el & CHANNEL_MAX;
        host_buf[el+1] = (CHANNEL_MAX - el) & CHANNEL_MAX;
        host_buf[el+2] = (CHANNEL_MAX/((el & 1) + 1) - el) & CHANNEL_MAX;
        host_buf[el+3] = (CHANNEL_MAX - el/((el & 1) + 1)) & CHANNEL_MAX;
      }

      cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clCreateBuffer");
      cl_mem img = clCreateImage(context, CL_MEM_READ_WRITE, &img_format, &img_desc, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clCreateImage");

      CHECK_CL_ERROR(clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, buf_size, host_buf, 0, NULL, NULL));

      const size_t offset = 0;
      const size_t origin[] = {0, 0, 0};
      const size_t region[] = {img_desc.image_width, img_desc.image_height, 1};

      CHECK_CL_ERROR(clEnqueueCopyBufferToImage(queue, buf, img, offset, origin, region, 0, NULL, NULL));

      size_t row_pitch, slice_pitch;
      cl_ushort *img_map = clEnqueueMapImage(queue, img, CL_TRUE, CL_MAP_READ, origin, region,
        &row_pitch, &slice_pitch, 0, NULL, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clEnqueueMapImage");

      CHECK_CL_ERROR(clFinish(queue));

      for (row = 0; row < img_desc.image_height; ++row) {
        for (col = 0; col < img_desc.image_width; ++col) {
          cl_ushort *img_pixel = (cl_ushort*)((char*)img_map + row*row_pitch) + col*4;
          cl_ushort *buf_pixel = host_buf + (row*img_desc.image_width + col)*4;

          if (memcmp(img_pixel, buf_pixel, pixel_size) != 0)
            printf("%zu %zu %zu : %x %x %x %x | %x %x %x %x\n",
              row, col, (size_t)(buf_pixel - host_buf),
              buf_pixel[0],
              buf_pixel[1],
              buf_pixel[2],
              buf_pixel[3],
              img_pixel[0],
              img_pixel[1],
              img_pixel[2],
              img_pixel[3]);

          TEST_ASSERT(memcmp(img_pixel, buf_pixel, pixel_size) == 0);

        }
      }

      CHECK_CL_ERROR(clEnqueueUnmapMemObject(queue, img, img_map, 0, NULL, NULL));

      /* Clear the buffer, and ensure it has been cleared */
      CHECK_CL_ERROR(clEnqueueFillBuffer(queue, buf, null_pixel, sizeof(null_pixel), 0, buf_size, 0, NULL, NULL));
      cl_ushort *buf_map = clEnqueueMapBuffer(queue, buf, CL_TRUE, CL_MAP_READ, 0, buf_size, 0, NULL, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clEnqueueMapBuffer");

      CHECK_CL_ERROR(clFinish(queue));

      for (el = 0; el < nels; ++el) {
#if 0 // debug
        if (buf_map[el] != 0) {
          printf("%zu/%zu => %u\n", el, nels, buf_map[el]);
        }
#endif
        TEST_ASSERT(buf_map[el] == 0);
      }

      CHECK_CL_ERROR(clEnqueueUnmapMemObject(queue, buf, buf_map, 0, NULL, NULL));

      /* Copy data from image to buffer, and check that it's again equal to the original buffer */
      CHECK_CL_ERROR(clEnqueueCopyImageToBuffer(queue, img, buf, origin, region, offset, 0, NULL, NULL));
      buf_map = clEnqueueMapBuffer(queue, buf, CL_TRUE, CL_MAP_READ, 0, buf_size, 0, NULL, NULL, &err);
      CHECK_CL_ERROR(clFinish(queue));

      TEST_ASSERT(memcmp(buf_map, host_buf, buf_size) == 0);

      CHECK_CL_ERROR (
          clEnqueueUnmapMemObject (queue, buf, buf_map, 0, NULL, NULL));
      CHECK_CL_ERROR (clFinish (queue));

      free(host_buf);
      CHECK_CL_ERROR (clReleaseMemObject (img));
      CHECK_CL_ERROR (clReleaseMemObject (buf));
      CHECK_CL_ERROR (clReleaseCommandQueue (queue));
      CHECK_CL_ERROR (clReleaseContext (context));
    }
  }
  return EXIT_SUCCESS;
}
void LSHReservoirSampler::initHelper(int numTablesIn, int numHashPerFamilyIn, int reservoriSizeIn) {

	/* Reservoir Random Number. */
	std::cout << "[LSHReservoirSampler::initHelper] Generating random number for reservoir sampling ..." << std::endl;
	std::default_random_engine generator1;
	std::uniform_int_distribution<unsigned int> distribution_a(0, 0x7FFFFFFF);
	_sechash_a = distribution_a(generator1) * 2 + 1;
	std::uniform_int_distribution<unsigned int> distribution_b(0, 0xFFFFFFFF >> _numSecHash);
	_sechash_b = distribution_b(generator1);

	_global_rand = new unsigned int[_maxReservoirRand];
	for (unsigned int i = 0; i < _maxReservoirRand; i++) {
		std::uniform_int_distribution<unsigned int> distribution(0, i);
		_global_rand[i] = distribution(generator1);
	}
#if defined OPENCL_HASHTABLE
	_globalRand_obj = clCreateBuffer(context_gpu, CL_MEM_READ_WRITE,
		_maxReservoirRand * sizeof(unsigned int), NULL, &_err);
	_err = clEnqueueWriteBuffer(command_queue_gpu, _globalRand_obj, CL_TRUE, 0,
		_maxReservoirRand * sizeof(unsigned int), _global_rand, 0, NULL, NULL);
#endif
	std::cout << "Completed. " << std::endl;
	
	/* Hash tables. */
	_tableMemReservoirMax = (_numTables - 1) * _aggNumReservoirs + _numReservoirsHashed;
	_tableMemMax = _tableMemReservoirMax * (1 + _reservoirSize);
	_tablePointerMax = _numTables * _numReservoirsHashed;
#if defined OPENCL_HASHTABLE
	std::cout << "Initializing GPU-OpenCL tables and pointers ...  " << std::endl;
	_tableMem_obj = clCreateBuffer(context_gpu, CL_MEM_READ_WRITE,
		_tableMemMax * sizeof(unsigned int), NULL, &_err);
	clCheckError(_err, "[initHelper] Failed to alloc GPU _tableMem_obj.");
	_err = clEnqueueFillBuffer(command_queue_gpu, _tableMem_obj, &_zero, sizeof(const int), 0,
		_tableMemMax * sizeof(unsigned int), 0, NULL, NULL);
	clCheckError(_err, "[initHelper] Failed to init GPU _tableMem_obj.");

	_tableMemAllocator_obj = clCreateBuffer(context_gpu, CL_MEM_READ_WRITE,
		_numTables * sizeof(unsigned int), NULL, &_err);
	clCheckError(_err, "[initHelper] Failed to alloc GPU _tableMemAllocator_obj.");
	_err = clEnqueueFillBuffer(command_queue_gpu, _tableMemAllocator_obj, &_zero, sizeof(const int), 0,
		_numTables * sizeof(unsigned int), 0, NULL, NULL);
	clCheckError(_err, "[initHelper] Failed to init GPU _tableMemAllocator_obj.");

	_tablePointers_obj = clCreateBuffer(context_gpu, CL_MEM_READ_WRITE,
		_tablePointerMax * sizeof(unsigned int), NULL, &_err);
	clCheckError(_err, "[initHelper] Failed to alloc GPU _tablePointers_obj.");
	_err = clEnqueueFillBuffer(command_queue_gpu, _tablePointers_obj, &_tableNull, sizeof(const int), 0,
		_tablePointerMax * sizeof(unsigned int), 0, NULL, NULL);
	clCheckError(_err, "[initHelper] Failed to init GPU _tablePointers_obj.");

	clFinish(command_queue_gpu);
	std::cout << "Completed. \n";
#elif defined CPU_TB
	std::cout << "Initializing CPU tables and pointers ... " << std::endl;
	_tableMem = new unsigned int[_tableMemMax]();
	_tableMemAllocator = new unsigned int[_numTables]();
	_tablePointers = new unsigned int[_tablePointerMax];
	_tablePointersLock = new omp_lock_t[_tablePointerMax];
	std::cout << "Completed. " << std::endl;
	std::cout << "Initializing CPU tablePointers/Locks ... " << std::endl;
	for (unsigned long long i = 0; i < _tablePointerMax; i++) {
		_tablePointers[i] = TABLENULL;
		omp_init_lock(_tablePointersLock + i);
	}
	std::cout << "Completed. " << std::endl;
	std::cout << "Initializing CPU tableCountersLocks ... " << std::endl;
	_tableCountersLock = new omp_lock_t[_tableMemReservoirMax]; 
	for (unsigned long long i = 0; i < _tableMemReservoirMax; i++) {
		omp_init_lock(_tableCountersLock + i);
	}
	std::cout << "Completed. " << std::endl;
#endif

	/* Hashing counter. */
	_sequentialIDCounter_kernel = 0;
}
Beispiel #10
0
int main(int argc, char **argv) {


	int iter;
	int total_number_sequences = 0;
	int total_number_targets = 0;
	int total_alignments = 0;

	int sequence_index_start = 0;
	int sequence_index_end = 0;
	int target_index_start = 0;
	int target_index_end = 0;


#ifdef PROFILING
	struct timeval startt, endt, startttotal, endttotal, starttiter, endtiter,
	timer_iter_total, timer_mm, timer_init, timer_H2D, timer_D2H, timer_kernel1, timer_kernel2, timer_plotAlignments, timer_kernel_builder,
	timer_total;

	struct timeval timer_iter_total_array[ITERATIONS];
	struct timeval timer_mm_array[ITERATIONS];
	struct timeval timer_init_array[ITERATIONS];
	struct timeval timer_H2D_array[ITERATIONS];
	struct timeval timer_D2H_array[ITERATIONS];
	struct timeval timer_kernel1_array[ITERATIONS];
	struct timeval timer_kernel2_array[ITERATIONS];
	struct timeval timer_plotAlignments_array[ITERATIONS];
	struct timeval timer_kernel_builder_array[ITERATIONS];

	timer_total.tv_usec = 0;
	timer_total.tv_sec = 0;
#endif

#ifdef PROFILING
	gettimeofday(&startttotal, NULL);
#endif
	for(iter=0; iter<ITERATIONS; iter++) {

		total_number_sequences = 0;
		total_number_targets = 0;

#ifdef PROFILING
		timer_iter_total.tv_usec = 0;
		timer_iter_total.tv_sec = 0;

		timer_mm.tv_usec = 0;
		timer_mm.tv_sec = 0;

		timer_init.tv_usec = 0;
		timer_init.tv_sec = 0;

		timer_H2D.tv_usec = 0;
		timer_H2D.tv_sec = 0;

		timer_D2H.tv_usec = 0;
		timer_D2H.tv_sec = 0;

		timer_kernel1.tv_usec = 0;
		timer_kernel1.tv_sec = 0;

		timer_kernel2.tv_usec = 0;
		timer_kernel2.tv_sec = 0;

		timer_plotAlignments.tv_usec = 0;
		timer_plotAlignments.tv_sec = 0;

		timer_kernel_builder.tv_usec = 0;
		timer_kernel_builder.tv_sec = 0;
#endif

#ifdef PROFILING
		gettimeofday(&starttiter, NULL);
#endif
		/** We do not have 11 arguments **/
		if (argc != 10) {
			fprintf(stderr,"Error: use: ./paswas_opencl <sequenceFile> <targetFile> <superBlocksX> <superBlocksY> <sequence_index_start> <sequence_index_end> <target_index_start> <target_index_end> <performanceFileLoc>!\n");
			exit(EXIT_FAILURE);
		}

		cl_platform_id platforms = NULL;
		cl_uint ret_num_platforms = 0;
		cl_uint ret_num_devices = 0;
		cl_device_id devices = NULL;
		cl_command_queue queue = NULL;
		cl_context context = NULL;
		cl_program program = NULL;
		cl_kernel kernel_calculateScore = NULL;
		cl_kernel kernel_traceback = NULL;
		cl_int error_check;

		error_check = clGetPlatformIDs(1, &platforms, &ret_num_platforms);
		if(error_check != CL_SUCCESS) {
			fprintf(stderr,"Could not find a valid OpenCL platform\n");
			exit(EXIT_FAILURE);
		}

		sequence_index_start = atoi(argv[5]);
		if(sequence_index_start<0){
			fprintf(stderr,"Please provide a valid sequence start index:%d\n",sequence_index_start);
			exit(EXIT_FAILURE);
		}
		fprintf(stderr,"sequence_index_start:%d\n",sequence_index_start);

		sequence_index_end = atoi(argv[6]);
		if(sequence_index_end<=sequence_index_start){
			fprintf(stderr,"Please provide a valid sequence end index:%d\n",sequence_index_end);
			exit(EXIT_FAILURE);
		}
		fprintf(stderr,"sequence_index_end:%d\n",sequence_index_end);

		target_index_start = atoi(argv[7]);
		if(target_index_start<0){
			fprintf(stderr,"Please provide a valid target start index:%d\n",target_index_start);
			exit(EXIT_FAILURE);
		}
		fprintf(stderr,"target_index_start:%d\n",target_index_start);

		target_index_end = atoi(argv[8]);
		if(target_index_end<=target_index_start){
			fprintf(stderr,"Please provide a valid target end index:%d\n",target_index_end);
			exit(EXIT_FAILURE);
		}
		fprintf(stderr,"target_index_end:%d\n",target_index_end);


	#ifdef NVIDIA
		error_check = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &devices, &ret_num_devices);
	#endif
	#ifdef AMD_GPU
		error_check = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &devices, &ret_num_devices);
	#endif
	#ifdef AMD_CPU
		error_check = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_CPU, 1, &devices, &ret_num_devices);
	#endif
	#ifdef INTEL
		error_check = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_CPU, 1, &devices, &ret_num_devices);
	#endif

		if(error_check != CL_SUCCESS) {
				fprintf(stderr,"No OpenCL devices found\n");
				exit(EXIT_FAILURE);
		}

		cl_context_properties properties[] = {
				CL_CONTEXT_PLATFORM, (cl_context_properties) platforms, 0
		};

		context = clCreateContext(properties, 1, &devices, &pfn_notify, NULL, &error_check);
		if(error_check != CL_SUCCESS) {
				fprintf(stderr,"Context could not be created\n");
				exit(EXIT_FAILURE);
		}

		queue = clCreateCommandQueue(context, devices, 0, &error_check);
		if(error_check != CL_SUCCESS) {
				fprintf(stderr,"Command queue could not be created");
				exit(EXIT_FAILURE);
		}

		dimensions superBlocks;
		superBlocks.x = atoi(argv[3]);
		superBlocks.y = atoi(argv[4]);
		if(!superBlocks.x || !superBlocks.y) {
			fprintf(stderr,"Please provide integer values for superblock_x or superblock_y\n");
		}
		superBlocks.z = 0;
		// variables needed for the application:

		fprintf(stderr,"Superblocksx: %d\tSuperblocksy:%d\n",superBlocks.x,superBlocks.y);
		fprintf(stderr,"#SEQUENCES: %d\t#TARGET:%d\n",NUMBER_SEQUENCES,NUMBER_TARGETS);

#ifdef PROFILING
		gettimeofday(&startt, NULL);
#endif

		char *descSequences = (char *) malloc(sizeof(char) * superBlocks.x*NUMBER_SEQUENCES*MAX_LINE_LENGTH);
		char *descTargets = (char *) malloc(sizeof(char) * superBlocks.y* NUMBER_TARGETS*MAX_LINE_LENGTH);
		descSequences[0] = '\0';
		descTargets[0] = '\0';

#ifdef PROFILING
		gettimeofday(&endt, NULL);
		timer_mm.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif

#ifdef PROFILING
		gettimeofday(&startt, NULL);
#endif

		float h_scoringsMatrix[SCORINGS_MAT_SIZE*SCORINGS_MAT_SIZE] = {0};
		fillScoringsMatrix(h_scoringsMatrix);

#ifdef PROFILING
		gettimeofday(&endt, NULL);
		timer_init.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif

		int min_align;
		error_check = clGetDeviceInfo(devices, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &min_align, NULL);
		if(error_check != CL_SUCCESS) {
			fprintf(stderr,"GetDeviceInfo failed");
			exit(EXIT_FAILURE);
		}
		fprintf(stderr, "ALIGN = %d\n", min_align);

#ifdef PROFILING
		gettimeofday(&startt, NULL);
#endif

#ifdef NO_ZERO_COPY
		char *h_sequences = 0, *h_targets = 0;
		cl_mem d_sequences, d_targets, d_matrix, d_globalMaxima, d_globalDirection;
		cl_mem d_indexIncrement, d_scoringsMatrix;
		GlobalMatrix *h_matrix = 0;
		GlobalMaxima *h_globalMaxima = 0;
		GlobalDirection *h_globalDirectionZeroCopy = 0;
		StartingPoints *h_startingPointsZeroCopy = 0;
		float *h_maxPossibleScoreZeroCopy = 0;
		cl_mem d_startingPointsZeroCopy, d_maxPossibleScoreZeroCopy;

		init(&h_sequences, &h_targets,
			 &d_sequences, &d_targets,
			 &h_matrix, &d_matrix,
				&h_globalMaxima, &d_globalMaxima,
				&d_globalDirection, &h_globalDirectionZeroCopy,
				&d_startingPointsZeroCopy, &h_startingPointsZeroCopy,
				&d_maxPossibleScoreZeroCopy, &h_maxPossibleScoreZeroCopy,
				&d_scoringsMatrix,
				&d_indexIncrement,
				superBlocks,
				context,
				queue,
				error_check);
#endif

		/**Data Structure Initialization with NVIDIA_zero_copy **/
#ifdef NVIDIA_ZERO_COPY
			char *h_sequences = 0, *h_targets = 0;
			cl_mem d_sequences, d_targets, d_matrix, d_globalMaxima, d_globalDirection;
			cl_mem d_indexIncrement, d_scoringsMatrix;
			GlobalMatrix *h_matrix = 0;
			GlobalMaxima *h_globalMaxima = 0;
			StartingPoints *h_startingPointsZeroCopy = 0;
			float *h_maxPossibleScoreZeroCopy = 0;
			GlobalDirection *h_globalDirectionZeroCopy = 0;
			cl_mem pinned_startingPointsZeroCopy, pinned_maxPossibleScoreZeroCopy, pinned_globalDirectionZeroCopy;
			cl_mem d_startingPointsZeroCopy, d_maxPossibleScoreZeroCopy;

			// allocate memory on host & device:
			init_zc(&h_sequences, &h_targets, 
					&d_sequences, &d_targets,
				&h_matrix, &d_matrix,
				&h_globalMaxima, &d_globalMaxima,
				&d_startingPointsZeroCopy, &h_startingPointsZeroCopy, &pinned_startingPointsZeroCopy,
				&d_maxPossibleScoreZeroCopy, &h_maxPossibleScoreZeroCopy, &pinned_maxPossibleScoreZeroCopy,
				&d_globalDirection, &h_globalDirectionZeroCopy, &pinned_globalDirectionZeroCopy,
				&d_scoringsMatrix,
				&d_indexIncrement,
				superBlocks,
				context, queue,
				error_check);
#endif

			/**Data Structure Initialization with INTEL_CPU_zero_copy **/
#ifdef INTEL_ZERO_COPY
			char *h_sequences = 0, *h_targets = 0;
			cl_mem d_sequences, d_targets, d_matrix, d_globalMaxima, d_globalDirection;
			cl_mem d_indexIncrement, d_scoringsMatrix;
			GlobalMatrix *h_matrix = 0;
			GlobalMaxima *h_globalMaxima = 0;
			GlobalDirection *h_globalDirectionZeroCopy = 0;
			StartingPoints *h_startingPointsZeroCopy = 0;
			float *h_maxPossibleScoreZeroCopy = 0;
			cl_mem d_startingPointsZeroCopy, d_maxPossibleScoreZeroCopy;

			/**Zero copy variables base address **/
			StartingPoints* startingPointsData = (StartingPoints*)memalign(min_align/8, sizeof(StartingPoints));
			float* maxPossibleScoreData = (float*)memalign(min_align/8, sizeof(float) * NUMBER_SEQUENCES * superBlocks.x);
			GlobalDirection* globalDirectionData = (GlobalDirection*)memalign(min_align/8, sizeof(GlobalDirection));
			// allocate memory on host & device:
			init_zc_CPU(&h_sequences, &h_targets,
					&d_sequences, &d_targets,
					&h_matrix, &d_matrix,
					&h_globalMaxima, &d_globalMaxima,
					&d_startingPointsZeroCopy, &startingPointsData,
					&d_maxPossibleScoreZeroCopy, &maxPossibleScoreData,
					&d_globalDirection, &globalDirectionData,
					&d_scoringsMatrix,
					&d_indexIncrement,
					superBlocks,
					context,
					queue,
					error_check);
#endif

#ifdef PROFILING
		gettimeofday(&endt, NULL);
		timer_mm.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif

		/**Fill GlobalMatrix with 0's, feature is only available in OpenCL 1.2, however this is not needed for the shared memory case**/
#ifdef PROFILING
		gettimeofday(&startt, NULL);
#endif

		#if (defined(INTEL) || defined(AMD)) && (defined(GLOBAL_MEM4))
			float zero_pattern = 0.0;
			error_check = clEnqueueFillBuffer(queue, d_matrix, &zero_pattern, sizeof(float), 0, sizeof(GlobalMatrix), 0, NULL, NULL);
			if(error_check != CL_SUCCESS) {
					fprintf(stderr,"Failed to initialize GlobalMatrix to zero\n");
					exit(EXIT_FAILURE);
			}
			clFinish(queue);
		#endif

#ifdef PROFILING
		gettimeofday(&endt, NULL);
		timer_mm.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif


#ifdef PROFILING
		gettimeofday(&startt, NULL);
#endif

#ifdef GLOBAL_MEM4
		GlobalSemaphores *h_semaphore = (GlobalSemaphores*) malloc(sizeof(GlobalSemaphores));
		cl_mem d_semaphore = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(GlobalSemaphores), NULL, &error_check);
		if(error_check != CL_SUCCESS) {
				fprintf(stderr,"Failed to create semaphore on device\n");
				exit(EXIT_FAILURE);
		}
		/** Initialize device buffer to all zeroes **/
		initSemaphor(&d_semaphore, context, queue, error_check);
#endif

#ifdef PROFILING
		gettimeofday(&endt, NULL);
		timer_mm.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif

		/* Write scoringsMatrix to device buffer*/
#ifdef PROFILING
		gettimeofday(&startt, NULL);
#endif

		error_check = clEnqueueWriteBuffer(queue, d_scoringsMatrix, CL_TRUE, 0, sizeof(float) * 26 * 26, h_scoringsMatrix, 0, NULL, NULL);

#ifdef PROFILING
		gettimeofday(&endt, NULL);
		timer_H2D.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif

		if(error_check != CL_SUCCESS) {
			fprintf(stderr,"Failed to write scoringsMatrix to device buffer");
			exit(EXIT_FAILURE);
		}

		gzFile targetFile, seqFile;
		kseq_t *target, *seq;

		targetFile = gzopen(argv[2], "r");
		seqFile = gzopen(argv[1], "r");

		if (!targetFile || !seqFile) {
			fprintf(stderr,"Error: could not open target/seq file!\n");
			return 1;
		}



#ifdef INTEL_ZERO_COPY

#ifdef PROFILING
		gettimeofday(&startt, NULL);
#endif
		h_maxPossibleScoreZeroCopy = (float *)clEnqueueMapBuffer(queue, d_maxPossibleScoreZeroCopy, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * NUMBER_SEQUENCES * superBlocks.x, 0, NULL, NULL, &error_check);
		clFinish(queue);

#ifdef PROFILING
		gettimeofday(&endt, NULL);
		timer_H2D.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif
		if(error_check != CL_SUCCESS) {
			fprintf(stderr, "clEnqueueMap CL_MAP_WRITE h_maxPossibleScoreZeroCopy => %d\n", error_check);
			exit(EXIT_FAILURE);
		}
#endif



		int t=target_index_start;
		int s=sequence_index_start;
		int target_offset = 0;
		int sequence_offset = 0;
		int l;

#ifdef PROFILING
		gettimeofday(&startt, NULL);
#endif
		target = kseq_init(targetFile);
		int skip=0;
		//Skip sequences which we do not need
		while(skip<target_index_start) {
			kseq_read(target);
			skip++;
		}

		while ((l = kseq_read(target)) >= 0 && t<target_index_end) {
			char * current = h_targets+(target_offset*(unsigned int)Y);
			char * currentDesc = descTargets + (target_offset*MAX_LINE_LENGTH);
			current[0] = '\0';
			currentDesc[0] = '\0';
			char * description = (char*) malloc( sizeof(char) * (target->name.l + target->comment.l + target->qual.l + 3));
			description[0] = '\0';

			strcpy(description, target->name.s);
			if(target->comment.l) {
				strcat(description, " ");
				strcat(description, target->comment.s);
			}

			if(target->qual.l) {
				strcat(description, " ");
				strcat(description, target->qual.s);
			}

			if(strlen(description)>=MAX_LINE_LENGTH) {
				description[MAX_LINE_LENGTH-1] = '\0';
			}
			strcpy(currentDesc,description);
			free(description);

			if(target->seq.l > Y) {
				fprintf(stderr, "Error: target read too long!\n");
				fprintf(stderr, "id: %s\n", currentDesc);
				fprintf(stderr, "Y:%d\n", Y);
				fprintf(stderr, "target_file:%s\n",argv[2]);
				return 1;
			}
			strncpy(current, target->seq.s, target->seq.l);

			int i=0;
			for (; i < target->seq.l; i++) {
				current[i] = toupper(current[i]);
				if (current[i] - 'A' < 0 || current[i] - 'A' > 25) {
					fprintf(stderr, "Error: wrong character in target file: '%c', desc: %s\n", current[i], current);
					return 1;
				}
			}
			for(; i < Y; i++) {
				current[i] = FILL_CHARACTER;
			}
			t++;
			target_offset++;
		}

		fprintf(stderr, "Read number of targets: %d from %s\n", target_offset,argv[2]);
		total_number_targets = target_offset;

		for (; target_offset < superBlocks.y * NUMBER_TARGETS; target_offset++) {
			for (int c=0; c < Y; c++)
				*(h_targets+target_offset*Y+c) = FILL_CHARACTER;
		}



		seq = kseq_init(seqFile);
		skip=0;

		//Skip sequences which we do not need
		while(skip<sequence_index_start) {
			kseq_read(seq);
			skip++;
		}

		while ((l = kseq_read(seq)) >= 0 && s<sequence_index_end) {
			h_maxPossibleScoreZeroCopy[sequence_offset] = 0;
			char * current = h_sequences +(sequence_offset*(unsigned int)X);
			char * currentDesc = descSequences + (sequence_offset*MAX_LINE_LENGTH);
			current[0] = '\0';
			currentDesc[0] = '\0';
			char * description = (char*) malloc( sizeof(char) * (seq->name.l + seq->comment.l + seq->qual.l + 3));
			description[0] = '\0';

			strcpy(description, seq->name.s);
			if(seq->comment.l) {
				strcat(description, " ");
				strcat(description, seq->comment.s);
			}
			if(seq->qual.l) {
				strcat(description, " ");
				strcat(description, seq->qual.s);
			}

			if(strlen(description)>=MAX_LINE_LENGTH) {
				description[MAX_LINE_LENGTH-1] = '\0';
			}

			strcpy(currentDesc,description);
			free(description);

			if(seq->seq.l > X) {
				fprintf(stderr, "Error: sequence read too long!\n");
				fprintf(stderr, "id: %s\n", currentDesc);
				fprintf(stderr, "X:%d\n", X);
				fprintf(stderr, "sequence_file:%s\n",argv[1]);
				return 1;
			}
			strncpy(current, seq->seq.s, seq->seq.l);

			int i=0;
			for (; i < seq->seq.l; i++) {
				current[i] = toupper(current[i]);
				h_maxPossibleScoreZeroCopy[sequence_offset] += HIGHEST_SCORE;
				if (current[i] - 'A' < 0 || current[i] - 'A' > 25) {
					fprintf(stderr, "Error: wrong character in seq file: '%c', desc: %s\n", current[i], current);
					return 1;
				}
			}
			for(; i < X; i++) {
				current[i] = FILL_CHARACTER;
			}
			h_maxPossibleScoreZeroCopy[sequence_offset] *= LOWER_LIMIT_MAX_SCORE;
			s++;
			sequence_offset++;
		}

		fprintf(stderr, "Read number of sequences: %d from %s\n", sequence_offset, argv[1]);
		total_number_sequences = sequence_offset;

		int count = 0;

		for (; sequence_offset < superBlocks.x * NUMBER_SEQUENCES; sequence_offset++) {
			count++;
			h_maxPossibleScoreZeroCopy[sequence_offset] = 0;
			for (int c=0; c < X; c++)
				*(h_sequences+sequence_offset*X+c) = FILL_CHARACTER;
		}



		kseq_destroy(target);
		gzclose(targetFile);
		kseq_destroy(seq);
		gzclose(seqFile);

#ifdef PROFILING
		gettimeofday(&endt, NULL);
		timer_init.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif

		/* Write maxPossibleScoreZeroCopy to device buffer*/
#ifdef PROFILING
		gettimeofday(&startt, NULL);
#endif

	#if defined(NO_ZERO_COPY) || defined(NVIDIA_ZERO_COPY)
		error_check = clEnqueueWriteBuffer(queue, d_maxPossibleScoreZeroCopy, CL_TRUE, 0, sizeof(float) * NUMBER_SEQUENCES * superBlocks.x, h_maxPossibleScoreZeroCopy, 0, NULL, NULL);
	#endif
	#ifdef INTEL_ZERO_COPY
		error_check = clEnqueueUnmapMemObject(queue, d_maxPossibleScoreZeroCopy, h_maxPossibleScoreZeroCopy, 0, NULL, NULL);
	#endif
		clFinish(queue);

#ifdef PROFILING
		gettimeofday(&endt, NULL);
		timer_H2D.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif

		if(error_check != CL_SUCCESS) {
			fprintf(stderr,"Failed to write maxPossibleScoreZeroCopy to buffer");
			exit(EXIT_FAILURE);
		}

#ifdef PROFILING
		gettimeofday(&startt, NULL);
#endif

		FILE *fp;
		size_t kernel_size;
		char *source;
		fp = fopen("kernel/smithwaterman_kern.cl", "rb");
		if(fp) {
			/* Set the file pointer to the end of the file */
			fseek(fp, 0, SEEK_END);

			kernel_size = ftell(fp);

			/* Set the file pointer back to the beginning of the file */
			fseek(fp, 0, SEEK_SET);

			source = (char*)malloc(kernel_size+1);
			fread(source, 1, kernel_size, fp);
			source[kernel_size] = '\0';

			fclose(fp);
		} else {
			fprintf(stderr,"Could not read from: kernel/smithwaterman_kern.cl\n");
			exit(EXIT_FAILURE);

		}

		program = clCreateProgramWithSource(context, 1, (const char **)&source, (const size_t *)&kernel_size, &error_check);
		if (error_check != CL_SUCCESS)
		{
			fprintf(stderr,"Could not create a program object with the provided source code");
			exit(EXIT_FAILURE);
		}

		if(source){
			free(source);
		}

		/**TODO Determine size of directives dynamically**/
		int size_directives = 2048;
		char *directives = (char *) malloc(sizeof(char) * size_directives);

#ifdef GLOBAL_MEM4
#ifdef NVIDIA
		sprintf(directives, "-DNVIDIA -DGLOBAL_MEM4 -DNUMBER_SEQUENCES=%d -DNUMBER_TARGETS=%d -DX=%d -DY=%d -DMINIMUM_SCORE=%f -DSHARED_X=%d -DSHARED_Y=%d -DWORKLOAD_X=%d -DWORKLOAD_Y=%d",NUMBER_SEQUENCES,NUMBER_TARGETS,X,Y,MINIMUM_SCORE,SHARED_X,SHARED_Y, WORKLOAD_X, WORKLOAD_Y);
#else
		sprintf(directives, "-DGLOBAL_MEM4 -DNUMBER_SEQUENCES=%d -DNUMBER_TARGETS=%d -DX=%d -DY=%d -DMINIMUM_SCORE=%f -DSHARED_X=%d -DSHARED_Y=%d -DWORKLOAD_X=%d -DWORKLOAD_Y=%d",NUMBER_SEQUENCES,NUMBER_TARGETS,X,Y,MINIMUM_SCORE,SHARED_X,SHARED_Y, WORKLOAD_X, WORKLOAD_Y);
#endif
#endif

#ifdef SHARED_MEM
#ifdef NVIDIA
		sprintf(directives, "-DNVIDIA -DSHARED_MEM -DNUMBER_SEQUENCES=%d -DNUMBER_TARGETS=%d -DX=%d -DY=%d -DMINIMUM_SCORE=%f -DSHARED_X=%d -DSHARED_Y=%d",NUMBER_SEQUENCES,NUMBER_TARGETS,X,Y,MINIMUM_SCORE,SHARED_X,SHARED_Y);
#else
		sprintf(directives, "-DSHARED_MEM -DNUMBER_SEQUENCES=%d -DNUMBER_TARGETS=%d -DX=%d -DY=%d -DMINIMUM_SCORE=%f -DSHARED_X=%d -DSHARED_Y=%d",NUMBER_SEQUENCES,NUMBER_TARGETS,X,Y,MINIMUM_SCORE,SHARED_X,SHARED_Y);
#endif
#endif

		error_check = clBuildProgram(program, 1, &devices,directives, NULL, NULL);
		free(directives);

		if (error_check != CL_SUCCESS)
		{
			size_t length;
			clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
			char* buffer = (char*)malloc(length+1);
			clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG, length, buffer, NULL);
			buffer[length] ='\0';
			fprintf(stderr,"Error: Failed to build program executable!\n");
			printf("%s\n", buffer);
			free(buffer);
			exit(EXIT_FAILURE);
		}

#ifdef PROFILING
		gettimeofday(&endt, NULL);
		timer_kernel_builder.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif

		total_alignments = 0;



		for (int i=0; i < superBlocks.y; i++) {
#ifdef PROFILING
			gettimeofday(&startt, NULL);
#endif

			error_check = clEnqueueWriteBuffer(queue, d_targets, CL_TRUE, 0, sizeof(char) * Y*NUMBER_TARGETS, h_targets+(i*NUMBER_TARGETS*Y), 0, NULL, NULL);
			clFinish(queue);

#ifdef PROFILING
			gettimeofday(&endt, NULL);
			timer_H2D.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif

			if(error_check != CL_SUCCESS) {
				fprintf(stderr,"Failed to write target data to buffer");
				exit(EXIT_FAILURE);
			}

			for (int j=0;j<superBlocks.x;j++) {
				// copy sequences to the device:
#ifdef PROFILING
				gettimeofday(&startt, NULL);
#endif

				error_check = clEnqueueWriteBuffer(queue, d_sequences, CL_TRUE, 0, sizeof(char)*X*NUMBER_SEQUENCES, h_sequences+(j*NUMBER_SEQUENCES*X), 0, NULL, NULL);
				clFinish(queue);

#ifdef PROFILING
				gettimeofday(&endt, NULL);
				timer_H2D.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif
				if(error_check != CL_SUCCESS) {
					fprintf(stderr,"Failed to write sequence data to buffer");
					exit(EXIT_FAILURE);
				}


				// make sure database-type index is reset:
#ifdef PROFILING
				gettimeofday(&startt, NULL);
#endif

				initZeroCopy(&d_indexIncrement, context, queue, error_check);

#ifdef PROFILING
				gettimeofday(&endt, NULL);
				timer_H2D.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif
				// fill the scorings matrix:
				kernel_calculateScore = clCreateKernel(program, "calculateScore", &error_check);
				if(error_check != CL_SUCCESS) {
					fprintf(stderr,"Failed to create calculate score kernel");
					exit(EXIT_FAILURE);
				}


#ifdef PROFILING
				gettimeofday(&startt, NULL);
#endif

				calculateScoreHost(d_matrix, d_sequences, d_targets, d_globalMaxima, d_globalDirection, d_scoringsMatrix, kernel_calculateScore, queue, error_check);

#ifdef PROFILING
				gettimeofday(&endt, NULL);
				timer_kernel1.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);

#endif


				// create tracebacks and copy information through zero copy to the host:
				kernel_traceback = clCreateKernel(program, "traceback", &error_check);
				if(error_check != CL_SUCCESS) {
					fprintf(stderr,"Failed to create calculate traceback kernel");
					exit(EXIT_FAILURE);
				}

#ifdef PROFILING
				gettimeofday(&startt, NULL);
#endif

#ifdef GLOBAL_MEM4
				tracebackHost(d_matrix, d_globalMaxima, d_globalDirection, d_indexIncrement, d_startingPointsZeroCopy,
						d_maxPossibleScoreZeroCopy, j*NUMBER_SEQUENCES, kernel_traceback, queue, error_check, d_semaphore);
#else
				tracebackHost(d_matrix, d_globalMaxima, d_globalDirection, d_indexIncrement, d_startingPointsZeroCopy,
										d_maxPossibleScoreZeroCopy, j*NUMBER_SEQUENCES, kernel_traceback, queue, error_check);
#endif

#ifdef PROFILING
				gettimeofday(&endt, NULL);
				timer_kernel2.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);

#endif				// get number of alignments:
				unsigned int index[1];

#ifdef PROFILING
				gettimeofday(&startt, NULL);
#endif
				error_check = clEnqueueReadBuffer(queue, d_indexIncrement, CL_TRUE, 0, sizeof(unsigned int), index, 0, NULL, NULL);
				clFinish(queue);
#ifdef PROFILING
				gettimeofday(&endt, NULL);
				timer_D2H.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif
				if(error_check != CL_SUCCESS) {
					fprintf(stderr,"Failed to read from d_indexIncrement");
					exit(EXIT_FAILURE);
				}

				fprintf(stderr, "Number of alignments: %d @ %d in iteration: %d\n", *index, j, iter);				// plot the alignments:

				total_alignments+=index[0];

#ifdef PROFILING
				gettimeofday(&startt, NULL);
#endif

				/**Reading from zeroCopy Buffer**/
#if defined(NO_ZERO_COPY) || defined(NVIDIA_ZERO_COPY)
				error_check = clEnqueueReadBuffer(queue, d_globalDirection, CL_TRUE, 0, sizeof(GlobalDirection), h_globalDirectionZeroCopy, 0, NULL, NULL);
#endif

#ifdef INTEL_ZERO_COPY
				h_globalDirectionZeroCopy = (GlobalDirection *)clEnqueueMapBuffer(queue, d_globalDirection, CL_TRUE, CL_MAP_READ, 0, sizeof(GlobalDirection), 0, NULL, NULL, &error_check);
#endif
				clFinish(queue);

#ifdef PROFILING
				gettimeofday(&endt, NULL);
				timer_D2H.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif
				if(error_check != CL_SUCCESS) {
					fprintf(stderr,"Failed to read from d_globalDirectionZeroCopy");
					exit(EXIT_FAILURE);
				}


#ifdef PROFILING
				gettimeofday(&startt, NULL);
#endif

				/**Reading from zeroCopy Buffer**/
	#if defined(NO_ZERO_COPY) || defined(NVIDIA_ZERO_COPY)
				error_check = clEnqueueReadBuffer(queue, d_startingPointsZeroCopy, CL_TRUE, 0, sizeof(StartingPoints), h_startingPointsZeroCopy, 0, NULL, NULL);
	#endif
	#ifdef INTEL_ZERO_COPY
				h_startingPointsZeroCopy = (StartingPoints *)clEnqueueMapBuffer(queue, d_startingPointsZeroCopy, CL_TRUE, CL_MAP_READ, 0, sizeof(StartingPoints), 0, NULL, NULL, &error_check);
	#endif
				clFinish(queue);

#ifdef PROFILING
				gettimeofday(&endt, NULL);
				timer_D2H.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif

				if(error_check != CL_SUCCESS) {
					fprintf(stderr,"Failed to read from d_startingPointsZeroCopy");
					exit(EXIT_FAILURE);
				}

#ifdef PROFILING
				gettimeofday(&startt, NULL);
#endif

				plotAlignments(h_sequences, h_targets, h_globalDirectionZeroCopy, *index, h_startingPointsZeroCopy, j*NUMBER_SEQUENCES, i*NUMBER_TARGETS, descSequences, descTargets);

#ifdef PROFILING
				gettimeofday(&endt, NULL);
				timer_plotAlignments.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif
			}


		}
#ifdef PROFILING
		gettimeofday(&startt, NULL);
#endif
		/** Unique to NVIDIA Zero-Copy **/
	#ifdef NVIDIA_ZERO_COPY
		clEnqueueUnmapMemObject(queue, pinned_maxPossibleScoreZeroCopy, (void*)h_maxPossibleScoreZeroCopy, 0, NULL, NULL);
		clEnqueueUnmapMemObject(queue, pinned_startingPointsZeroCopy, (void*)h_startingPointsZeroCopy, 0, NULL, NULL);
		clEnqueueUnmapMemObject(queue, pinned_globalDirectionZeroCopy, (void*)h_globalDirectionZeroCopy, 0, NULL, NULL);
		clFinish(queue);

		error_check = 0;
		error_check = clReleaseMemObject(d_sequences);
		error_check |= clReleaseMemObject(d_targets);
		error_check |= clReleaseMemObject(d_matrix);
		error_check |= clReleaseMemObject(d_globalMaxima);
		error_check |= clReleaseMemObject(d_globalDirection);
		error_check |= clReleaseMemObject(pinned_startingPointsZeroCopy);
		error_check |= clReleaseMemObject(d_startingPointsZeroCopy);
		error_check |= clReleaseMemObject(pinned_maxPossibleScoreZeroCopy);
		error_check |= clReleaseMemObject(d_maxPossibleScoreZeroCopy);
		error_check |= clReleaseMemObject(pinned_globalDirectionZeroCopy);
		error_check |= clReleaseMemObject(d_scoringsMatrix);
		error_check |= clReleaseMemObject(d_indexIncrement);

		if(error_check!= CL_SUCCESS) {
			fprintf(stderr, "Releasing memory objects has failed in NVIDIA zero copy\n");
			exit(EXIT_FAILURE);
		}

		free(h_sequences);
		free(h_targets);
		free(h_matrix);
		free(h_globalMaxima);
	#endif

		/** Unique to NO Zero-Copy **/
	#ifdef NO_ZERO_COPY
		error_check = 0;
		error_check |= clReleaseMemObject(d_sequences);
		error_check |= clReleaseMemObject(d_targets);
		error_check |= clReleaseMemObject(d_matrix);
		error_check |= clReleaseMemObject(d_globalMaxima);
		error_check |= clReleaseMemObject(d_globalDirection);
		error_check |= clReleaseMemObject(d_startingPointsZeroCopy);
		error_check |= clReleaseMemObject(d_maxPossibleScoreZeroCopy);
		error_check |= clReleaseMemObject(d_scoringsMatrix);
		error_check |= clReleaseMemObject(d_indexIncrement);

		if(error_check!= CL_SUCCESS) {
			fprintf(stderr, "Releasing memory objects has failed in no zero copy\n");
			exit(EXIT_FAILURE);
		}

		free(h_sequences);
		free(h_targets);
		free(h_matrix);
		free(h_globalMaxima);
		free(h_globalDirectionZeroCopy);
		free(h_startingPointsZeroCopy);
		free(h_maxPossibleScoreZeroCopy);
	#endif


		/** Unique to Intel Zero-Copy **/
	#ifdef INTEL_ZERO_COPY
		error_check = 0;
		error_check |= clReleaseMemObject(d_sequences);
		error_check |= clReleaseMemObject(d_targets);
		error_check |= clReleaseMemObject(d_matrix);
		error_check |= clReleaseMemObject(d_globalMaxima);
		error_check |= clReleaseMemObject(d_globalDirection);
		error_check |= clReleaseMemObject(d_startingPointsZeroCopy);
		error_check |= clReleaseMemObject(d_maxPossibleScoreZeroCopy);
		error_check |= clReleaseMemObject(d_scoringsMatrix);
		error_check |= clReleaseMemObject(d_indexIncrement);

		if(error_check!= CL_SUCCESS) {
			fprintf(stderr, "Releasing memory objects has failed in INTEL zero copy\n");
			exit(EXIT_FAILURE);
		}
		free(h_sequences);
		free(h_targets);
		free(h_matrix);
		free(h_globalMaxima);
		free(startingPointsData);
		free(maxPossibleScoreData);
		free(globalDirectionData);

	#endif

#ifdef GLOBAL_MEM4
		free(h_semaphore);
		clReleaseMemObject(d_semaphore);
#endif


		free(descTargets);
		free(descSequences);

		error_check = 0;
		error_check = clReleaseKernel(kernel_calculateScore);
		error_check |= clReleaseKernel(kernel_traceback);
		if(error_check!= CL_SUCCESS) {
			fprintf(stderr, "Could not release kernels\n");
			exit(EXIT_FAILURE);
		}
		error_check = clReleaseProgram(program);
		if(error_check!= CL_SUCCESS) {
			fprintf(stderr, "Could not release program\n");
			exit(EXIT_FAILURE);
		}
		error_check = clReleaseCommandQueue(queue);
		if(error_check!= CL_SUCCESS) {
			fprintf(stderr, "Could not release CommandQueue\n");
			exit(EXIT_FAILURE);
		}
		error_check = clReleaseContext(context);
		if(error_check!= CL_SUCCESS) {
			fprintf(stderr, "Could not release context\n");
			exit(EXIT_FAILURE);
		}




#ifdef PROFILING
		gettimeofday(&endt, NULL);
		timer_mm.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec);
#endif

#ifdef PROFILING
		gettimeofday(&endtiter, NULL);
		timer_iter_total.tv_usec += (endtiter.tv_sec*1000000+endtiter.tv_usec) - (starttiter.tv_sec*1000000+starttiter.tv_usec);
#endif

#ifdef PROFILING
		timer_iter_total_array[iter] = timer_iter_total;
		timer_mm_array[iter] = timer_mm;
		timer_init_array[iter] = timer_init;
		timer_H2D_array[iter] = timer_H2D;
		timer_D2H_array[iter] = timer_D2H;
		timer_kernel1_array[iter] = timer_kernel1;
		timer_kernel2_array[iter] = timer_kernel2;
		timer_plotAlignments_array[iter] = timer_plotAlignments;
		timer_kernel_builder_array[iter] = timer_kernel_builder;
#endif

	}

#ifdef PROFILING
	gettimeofday(&endttotal, NULL);
	timer_total.tv_usec += (endttotal.tv_sec*1000000+endttotal.tv_usec) - (startttotal.tv_sec*1000000+startttotal.tv_usec);
#endif

#ifdef PROFILING
	if(exists(argv[9])) {
		FILE *timeFile;
		timeFile = fopen(argv[9],"a+");
		if(timeFile) {
			for(iter=0; iter<ITERATIONS; iter++) {
				//fprintf(timeFile, "%d\t", iter);
				fprintf(timeFile, "%d\t",sequence_index_start);
				fprintf(timeFile, "%d\t",sequence_index_end);
				fprintf(timeFile, "%d\t",total_number_sequences);
				fprintf(timeFile, "%d\t", X);
				fprintf(timeFile, "%d\t", NUMBER_SEQUENCES);
				fprintf(timeFile, "%s\t", argv[3]);
				fprintf(timeFile, "%d\t",target_index_start);
				fprintf(timeFile, "%d\t",target_index_end);
				fprintf(timeFile, "%d\t",total_number_targets);
				fprintf(timeFile, "%d\t", Y);
				fprintf(timeFile, "%d\t", NUMBER_TARGETS);
				fprintf(timeFile, "%s\t", argv[4]);
				fprintf(timeFile, "%ld.%06ld\t",timer_mm_array[iter].tv_usec/1000000, timer_mm_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_init_array[iter].tv_usec/1000000, timer_init_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_H2D_array[iter].tv_usec/1000000, timer_H2D_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_D2H_array[iter].tv_usec/1000000, timer_D2H_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_kernel1_array[iter].tv_usec/1000000, timer_kernel1_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_kernel2_array[iter].tv_usec/1000000, timer_kernel2_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_plotAlignments_array[iter].tv_usec/1000000, timer_plotAlignments_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_iter_total_array[iter].tv_usec/1000000, timer_iter_total_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%d\n",total_alignments);

			}
			fprintf(timeFile, "#Total execution time: %ld.%06ld\n",timer_total.tv_usec/1000000, timer_total.tv_usec%1000000);
			fprintf(timeFile, "\n");
			fclose(timeFile);

		} else{
			fprintf(stderr,"Cannot append to file: %s\n", argv[9]);
			exit(EXIT_FAILURE);
		}

	} else {
		FILE *timeFile;
		timeFile = fopen(argv[9],"a+");
		if(timeFile) {
			fprintf(timeFile, "#Start_s\tEnd_s\tTotal_s\tX\tDevice_s\tSuperblock_s\tStart_t\tEnd_t\tTotal_t\tY\tDevice_t\tSuperblock_t\tMem_Management\tInit\tH2D\tD2H\tkernel1\tkernel2\tPlotAlignments\titer_total\tAlignments\n");
			for(iter=0; iter<ITERATIONS; iter++) {
				fprintf(timeFile, "%d\t",sequence_index_start);
				fprintf(timeFile, "%d\t",sequence_index_end);
				fprintf(timeFile, "%d\t",total_number_sequences);
				fprintf(timeFile, "%d\t", X);
				fprintf(timeFile, "%d\t", NUMBER_SEQUENCES);
				fprintf(timeFile, "%s\t", argv[3]);
				fprintf(timeFile, "%d\t",target_index_start);
				fprintf(timeFile, "%d\t",target_index_end);
				fprintf(timeFile, "%d\t",total_number_targets);
				fprintf(timeFile, "%d\t", Y);
				fprintf(timeFile, "%d\t", NUMBER_TARGETS);
				fprintf(timeFile, "%s\t", argv[4]);
				fprintf(timeFile, "%ld.%06ld\t",timer_mm_array[iter].tv_usec/1000000, timer_mm_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_init_array[iter].tv_usec/1000000, timer_init_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_H2D_array[iter].tv_usec/1000000, timer_H2D_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_D2H_array[iter].tv_usec/1000000, timer_D2H_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_kernel1_array[iter].tv_usec/1000000, timer_kernel1_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_kernel2_array[iter].tv_usec/1000000, timer_kernel2_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_plotAlignments_array[iter].tv_usec/1000000, timer_plotAlignments_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%ld.%06ld\t",timer_iter_total_array[iter].tv_usec/1000000, timer_iter_total_array[iter].tv_usec%1000000);
				fprintf(timeFile, "%d\n",total_alignments);

			}
			fprintf(timeFile, "#Total execution time: %ld.%06ld\n",timer_total.tv_usec/1000000, timer_total.tv_usec%1000000);
			fprintf(timeFile, "\n");
			fclose(timeFile);

		} else{
			fprintf(stderr,"Cannot create file: %s\n", argv[9]);
			exit(EXIT_FAILURE);
		}

	}
#endif

	return(0);
}
Beispiel #11
0
rcl_status cl_create_buffer(struct client_state* state, uint64_t size,
	buffer_type type, buffer_t* buffer)
{
	int32_t index;

	uint8_t zero = 0;

	cl_int error;
	cl_mem_flags flags = 0;

	rcl_status status = RCL_OK;

	if (size == 0)
		return RCL_INVALID_VALUE;

	struct buffer_state* buffer_state = malloc(sizeof(struct buffer_state));
	if (!buffer_state)
		return RCL_HOST_RESOURCE;
	buffer_state->original = NULL;
	buffer_state->type = type;
	buffer_state->size = size;

	log_print(log_notice, "Creating buffer, size: %" PRIu64", type: %s", size,
		type == BUFFER_READ_ONLY ? "read only" : (type == BUFFER_WRITE_ONLY
				? "write only" : "read/write"));

	switch (type) {
		case BUFFER_READ_ONLY:
			flags |= CL_MEM_READ_ONLY;
			break;
		case BUFFER_WRITE_ONLY:
			flags |= CL_MEM_WRITE_ONLY;
			break;
		case BUFFER_READ_WRITE:
			flags |= CL_MEM_READ_WRITE;
			break;
	}

	buffer_state->id = clCreateBuffer(state->context, flags, size, NULL,
		&error);
	if (error != CL_SUCCESS) {
		log_print(log_error, "Error creating buffer: %s", clerror_name(error));
		status = opencl_error(error);
		goto out_state;
	}

	if (type == BUFFER_WRITE_ONLY || type == BUFFER_READ_WRITE) {
		if (cl_feature_check(CL_FEATURE_FILL_BUFFER)) {
			error = clEnqueueFillBuffer(state->command_queue, buffer_state->id,
				&zero, sizeof(zero), 0, size, 0, NULL, NULL);
		} else
			error = cl_utils_clear_buffer(state, buffer_state->id, size);
		if (error != CL_SUCCESS) {
			status = opencl_error(error);
			goto out_state;
		}

		buffer_state->original = malloc(size);
		if (!buffer_state->original) {
			status = RCL_HOST_RESOURCE;
			goto out_mem;
		}
		memset(buffer_state->original, 0, size);
	}

	index = vector_add(&state->buffers, &buffer_state);
	if (index < 0) {
		status = RCL_HOST_RESOURCE;
		goto out_mem;
	}

	*buffer = index + 1;
	return RCL_OK;

out_mem:
	clReleaseMemObject(buffer_state->id);
out_state:
	if (buffer_state)
		free(buffer_state->original);
	free(buffer_state);
	return status;
}
Beispiel #12
0
clsparseStatus
reduce_by_key(
    int keys_first,
    int keys_last,
    int values_first,
    cl_mem keys_input,
    cl_mem values_input,
    cl_mem keys_output,
    cl_mem values_output,
    int *count,
    clsparseControl control
)
{

    cl_int l_Error;

    /**********************************************************************************
     * Compile Options
     *********************************************************************************/
    const int kernel0_WgSize = WAVESIZE*KERNEL02WAVES;
    const int kernel1_WgSize = WAVESIZE*KERNEL1WAVES;
    const int kernel2_WgSize = WAVESIZE*KERNEL02WAVES;

    //const std::string params = std::string() +
    //          " -DKERNEL0WORKGROUPSIZE=" + std::to_string(kernel0_WgSize)
    //        + " -DKERNEL1WORKGROUPSIZE=" + std::to_string(kernel1_WgSize)
    //        + " -DKERNEL2WORKGROUPSIZE=" + std::to_string(kernel2_WgSize);
    const std::string params;

    cl::Context context = control->getContext();
    std::vector<cl::Device> dev = context.getInfo<CL_CONTEXT_DEVICES>();
    int computeUnits  = dev[0].getInfo< CL_DEVICE_MAX_COMPUTE_UNITS >( );
    int wgPerComputeUnit = dev[0].getInfo< CL_DEVICE_MAX_WORK_GROUP_SIZE >( );


    int resultCnt = computeUnits * wgPerComputeUnit;
    cl_uint numElements = keys_last - keys_first + 1;

    size_t sizeInputBuff = numElements;
    int modWgSize = (sizeInputBuff & (kernel0_WgSize-1));
    if( modWgSize )
    {
        sizeInputBuff &= ~modWgSize;
        sizeInputBuff += kernel0_WgSize;
    }
    cl_uint numWorkGroupsK0 = static_cast< cl_uint >( sizeInputBuff / kernel0_WgSize );

    size_t sizeScanBuff = numWorkGroupsK0;
    modWgSize = (sizeScanBuff & (kernel0_WgSize-1));
    if( modWgSize )
    {
        sizeScanBuff &= ~modWgSize;
        sizeScanBuff += kernel0_WgSize;
    }

    cl_mem tempArrayVec = clCreateBuffer(context(),CL_MEM_READ_WRITE, (numElements)*sizeof(int), NULL, NULL );

    /**********************************************************************************
     *  Kernel 0
     *********************************************************************************/

    cl::Kernel kernel0 = KernelCache::get(control->queue,"reduce_by_key", "OffsetCalculation", params);

    KernelWrap kWrapper0(kernel0);

    kWrapper0 << keys_input << tempArrayVec
              << numElements;

    cl::NDRange local0(kernel0_WgSize);
    cl::NDRange global0(sizeInputBuff);

    cl_int status = kWrapper0.run(control, global0, local0);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    int init = 0;

    scan(0,
	 numElements - 1,
         tempArrayVec,
         tempArrayVec,
         0,
         0,
         control
         );

    int pattern = 0;
    cl_mem keySumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE, (sizeScanBuff)*sizeof(int), NULL, NULL );
    cl_mem preSumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE, (sizeScanBuff)*sizeof(int), NULL, NULL );
    cl_mem postSumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE,(sizeScanBuff)*sizeof(int), NULL, NULL );
    clEnqueueFillBuffer(control->queue(), keySumArray, &pattern, sizeof(int), 0,
                        (sizeScanBuff)*sizeof(int), 0, NULL, NULL);
    clEnqueueFillBuffer(control->queue(), preSumArray, &pattern, sizeof(int), 0,
                        (sizeScanBuff)*sizeof(int), 0, NULL, NULL);
    clEnqueueFillBuffer(control->queue(), postSumArray, &pattern, sizeof(int), 0,
                        (sizeScanBuff)*sizeof(int), 0, NULL, NULL);


    /**********************************************************************************
     *  Kernel 1
     *********************************************************************************/

    cl::Kernel kernel1 = KernelCache::get(control->queue,"reduce_by_key", "perBlockScanByKey", params);

    KernelWrap kWrapper1(kernel1);

    kWrapper1 << tempArrayVec
	      << values_input
              << numElements
	      << keySumArray
	      << preSumArray;

    cl::NDRange local1(kernel0_WgSize);
    cl::NDRange global1(sizeInputBuff);

    status = kWrapper1.run(control, global1, local1);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    /**********************************************************************************
     *  Kernel 2
     *********************************************************************************/
    cl_uint workPerThread = static_cast< cl_uint >( sizeScanBuff / kernel1_WgSize );

    cl::Kernel kernel2 = KernelCache::get(control->queue,"reduce_by_key", "intraBlockInclusiveScanByKey", params);

    KernelWrap kWrapper2(kernel2);

    kWrapper2 << keySumArray << preSumArray
              << postSumArray << numWorkGroupsK0 << workPerThread;

    cl::NDRange local2(kernel1_WgSize);
    cl::NDRange global2(kernel1_WgSize);

    status = kWrapper2.run(control, global2, local2);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    /**********************************************************************************
     *  Kernel 3
     *********************************************************************************/

    cl::Kernel kernel3 = KernelCache::get(control->queue,"reduce_by_key", "keyValueMapping", params);

    KernelWrap kWrapper3(kernel3);

    kWrapper3 << keys_input << keys_output
              << values_input << values_output << tempArrayVec
              << keySumArray << postSumArray << numElements;

    cl::NDRange local3(kernel0_WgSize);
    cl::NDRange global3(sizeInputBuff);

    status = kWrapper3.run(control, global3, local3);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    int *h_result = (int *) malloc (sizeof(int));

    clEnqueueReadBuffer(control->queue(),
                        tempArrayVec,
                        1,
                       (numElements-1)*sizeof(int),
                        sizeof(int),
                        h_result,
                        0,
                        0,
                        0);

    *count = *(h_result);
    //printf("h_result = %d\n", *count );

    //release buffers
    clReleaseMemObject(tempArrayVec);
    clReleaseMemObject(preSumArray);
    clReleaseMemObject(postSumArray);
    clReleaseMemObject(keySumArray);

    return clsparseSuccess;
}   //end of reduce_by_key
int main(int argc, char *argv[])
{
	// selected platform and device number
	cl_uint pn = 0, dn = 0;

	// OpenCL error
	cl_int error;

	// generic iterator
	cl_uint i;

	// major/minor version of the platform OpenCL version
	cl_uint ocl_major, ocl_minor;

	// set platform/device num from command line
	if (argc > 1)
		pn = atoi(argv[1]);
	if (argc > 2)
		dn = atoi(argv[2]);

	error = clGetPlatformIDs(0, NULL, &np);
	CHECK_ERROR("getting amount of platform IDs");
	printf("%u platforms found\n", np);
	if (pn >= np) {
		fprintf(stderr, "there is no platform #%u\n" , pn);
		exit(1);
	}
	// only allocate for IDs up to the intended one
	platform = calloc(pn+1,sizeof(*platform));
	// if allocation failed, next call will bomb. rely on this
	error = clGetPlatformIDs(pn+1, platform, NULL);
	CHECK_ERROR("getting platform IDs");

	// choose platform
	p = platform[pn];

	error = clGetPlatformInfo(p, CL_PLATFORM_NAME, BUFSZ, strbuf, NULL);
	CHECK_ERROR("getting platform name");
	printf("using platform %u: %s\n", pn, strbuf);

	error = clGetPlatformInfo(p, CL_PLATFORM_VERSION, BUFSZ, strbuf, NULL);
	CHECK_ERROR("getting platform version");
	// we need 1.2 at least
	i = sscanf(strbuf, "OpenCL %u.%u ", &ocl_major, &ocl_minor);
	if (i != 2) {
		fprintf(stderr, "%s:%u: unable to determine platform OpenCL version\n",
			__func__, __LINE__);
		exit(1);
	}
	if (ocl_major == 1 && ocl_minor < 2) {
		fprintf(stderr, "%s:%u: Platform version %s is not at least 1.2\n",
			__func__, __LINE__, strbuf);
		exit(1);
	}

	error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &nd);
	CHECK_ERROR("getting amount of device IDs");
	printf("%u devices found\n", nd);
	if (dn >= nd) {
		fprintf(stderr, "there is no device #%u\n", dn);
		exit(1);
	}
	// only allocate for IDs up to the intended one
	device = calloc(dn+1,sizeof(*device));
	// if allocation failed, next call will bomb. rely on this
	error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, dn+1, device, NULL);
	CHECK_ERROR("getting device IDs");

	// choose device
	d = device[dn];
	error = clGetDeviceInfo(d, CL_DEVICE_NAME, BUFSZ, strbuf, NULL);
	CHECK_ERROR("getting device name");
	printf("using device %u: %s\n", dn, strbuf);

	error = clGetDeviceInfo(d, CL_DEVICE_GLOBAL_MEM_SIZE,
			sizeof(gmem), &gmem, NULL);
	CHECK_ERROR("getting device global memory size");
	error = clGetDeviceInfo(d, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
			sizeof(alloc_max), &alloc_max, NULL);
	CHECK_ERROR("getting device max memory allocation size");

	// create context
	ctx_prop[1] = (cl_context_properties)p;
	ctx = clCreateContext(ctx_prop, 1, &d, NULL, NULL, &error);
	CHECK_ERROR("creating context");

	// create queue
	q = clCreateCommandQueue(ctx, d, CL_QUEUE_PROFILING_ENABLE, &error);
	CHECK_ERROR("creating queue");

	// create program
	pg = clCreateProgramWithSource(ctx, sizeof(src)/sizeof(*src), src, NULL, &error);
	CHECK_ERROR("creating program");

	// build program
	error = clBuildProgram(pg, 1, &d, NULL, NULL, NULL);
	CHECK_ERROR("building program");

	// get kernel
	k = clCreateKernel(pg, "add", &error);
	CHECK_ERROR("creating kernel");

	error = clGetKernelWorkGroupInfo(k, d, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
			sizeof(wgm), &wgm, NULL);
	CHECK_ERROR("getting preferred workgroup size multiple");

	// number of elements on which kernel will be launched. it's ok if we don't
	// cover every byte of the buffers
	nels = alloc_max/sizeof(cl_float);

	gws = ROUND_MUL(nels, wgm);

	printf("will use %zu workitems grouped by %zu to process %u elements\n",
			gws, wgm, nels);

	// we will try and allocate at least one buffer more than needed to fill
	// the device memory, and no less than 3 anyway
	nbuf = gmem/alloc_max + 1;
	if (nbuf < 3)
		nbuf = 3;

#define MB (1024*1024.0)

	printf("will try allocating %u host buffers of %gMB each to overcommit %gMB\n",
			nbuf, alloc_max/MB, gmem/MB);

	hostbuf = calloc(nbuf, sizeof(cl_mem));

	if (!hostbuf) {
		fprintf(stderr, "could not prepare support for %u buffers\n", nbuf);
		exit(1);
	}

	// allocate ‘host’ buffers
	for (i = 0; i < nbuf; ++i) {
		hostbuf[i] = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, alloc_max,
				NULL, &error);
		CHECK_ERROR("allocating host buffer");
		printf("host buffer %u allocated\n", i);
		error = clEnqueueMigrateMemObjects(q, 1, hostbuf + i,
				CL_MIGRATE_MEM_OBJECT_HOST | CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED,
				0, NULL, NULL);
		CHECK_ERROR("migrating buffer to host");
		printf("buffer %u migrated to host\n", i);
	}

	// allocate ‘device’ buffers
	for (i = 0; i < 2; ++i) {
		devbuf[i] = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, alloc_max,
				NULL, &error);
		CHECK_ERROR("allocating devbuffer");
		printf("dev buffer %u allocated\n", i);
		if (i == 0) {
			float patt = 0;
			error = clEnqueueFillBuffer(q, devbuf[0], &patt, sizeof(patt),
					0, nels*sizeof(patt), 0, NULL, &mem_evt);
			CHECK_ERROR("enqueueing memset");
		}
	}
	error = clWaitForEvents(1, &mem_evt);
	CHECK_ERROR("waiting for buffer fill");
	clReleaseEvent(mem_evt); mem_evt = NULL;

	// use the buffers
	for (i = 0; i < nbuf; ++i) {
		printf("testing buffer %u\n", i);

		// for each buffer, we do a setup on CPU and then use it as second
		// argument for the kernel
		hbuf = clEnqueueMapBuffer(q, hostbuf[i], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
				0, alloc_max, 0, NULL, NULL, &error);
		CHECK_ERROR("mapping buffer");
		for (e = 0; e < nels; ++e)
			hbuf[e] = i;
		error = clEnqueueUnmapMemObject(q, hostbuf[i], hbuf, 0, NULL, NULL);
		CHECK_ERROR("unmapping buffer");
		hbuf = NULL;

		// copy ‘host’ to ‘device’ buffer
		clEnqueueCopyBuffer(q, hostbuf[i], devbuf[1], 0, 0, alloc_max,
				0, NULL, NULL);
		// make sure all pending actions are completed
		error =	clFinish(q);
		CHECK_ERROR("settling down");

		clSetKernelArg(k, 0, sizeof(cl_mem), devbuf);
		clSetKernelArg(k, 1, sizeof(cl_mem), devbuf + 1);
		clSetKernelArg(k, 2, sizeof(nels), &nels);
		error = clEnqueueNDRangeKernel(q, k, 1, NULL, &gws, &wgm,
				0, NULL, &krn_evt);
		CHECK_ERROR("enqueueing kernel");

		error = clEnqueueCopyBuffer(q, devbuf[0], hostbuf[0],
				0, 0, alloc_max, 1, &krn_evt, &mem_evt);
		CHECK_ERROR("copying data to host");

		expected = i*(i+1)/2.0f;
		hbuf = clEnqueueMapBuffer(q, hostbuf[0], CL_TRUE, CL_MAP_READ,
				0, alloc_max, 1, &mem_evt, NULL, &error);
		CHECK_ERROR("mapping buffer 0");
		for (e = 0; e < nels; ++e)
			if (hbuf[e] != expected) {
				fprintf(stderr, "mismatch @ %u: %g instead of %g\n",
						e, hbuf[e], expected);
				exit(1);
			}
		error = clEnqueueUnmapMemObject(q, hostbuf[0], hbuf, 0, NULL, NULL);
		CHECK_ERROR("unmapping buffer 0");
		hbuf = NULL;
		clReleaseEvent(krn_evt);
		clReleaseEvent(mem_evt);
		krn_evt = mem_evt = NULL;
	}

	for (i = 1; i <= 2; ++i) {
		clReleaseMemObject(devbuf[2 - i]);
		printf("dev buffer %u freed\n", nbuf  - i);
	}
	for (i = 1; i <= nbuf; ++i) {
		clReleaseMemObject(hostbuf[nbuf - i]);
		printf("host buffer %u freed\n", nbuf  - i);
	}

	return 0;
}