void simulation_step() {
	cl_event ev_kernel; cl_int err;

	size_t glb_sz[1] = { N };
	err = clEnqueueNDRangeKernel(ocl.queue, ocl.kernel_step, 1, NULL, glb_sz, NULL, 0, NULL, &ev_kernel);
	CLU_ERRCHECK(err, "Failed to enqueue kernel");
	// read result...
	err = clEnqueueReadBuffer(ocl.queue, ocl.mem_bodies, CL_TRUE, 0, N * sizeof(body), &B, 0, NULL, NULL);
	CLU_ERRCHECK(err, "Failed to read result");
	// and evaluate time
	totalTime += cluGetDurationMS(ev_kernel);
	steps++;
}
void cleanup_ocl() {
	cl_int err;

	// finalization
	err = clFinish(ocl.queue);
	err |= clReleaseKernel(ocl.kernel_step);
	err |= clReleaseProgram(ocl.prog);
	err |= clReleaseMemObject(ocl.mem_bodies);
	err |= clReleaseCommandQueue(ocl.queue);
	err |= clReleaseContext(ocl.ctx);
	CLU_ERRCHECK(err, "Failed during ocl cleanup");
}
double getDurationMS(cl_event event) {
	cl_ulong start = 0;
	cl_ulong end = 0;
	cl_int ret;

	ret = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), (void *)&start, NULL);
	ret |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), (void *)&end, NULL);

	CLU_ERRCHECK(ret, "Unable to read profiling information!");

	// the values are measured in nano seconds!
	return (double)(end - start) / 1000000.0;
}
// ----------- opencl -------------
void init_ocl() {
	cl_int err;

	ocl.id = cluInitDevice(CL_DEVICE, &ocl.ctx, &ocl.queue);
	printf("OCL Device: %s\n", cluGetDeviceDescription(ocl.id, CL_DEVICE));

	// create kernel from source
	ocl.prog = cluBuildProgramFromFile(ocl.ctx, ocl.id, KERNEL_FILE_NAME, NULL);
	ocl.kernel_step = clCreateKernel(ocl.prog, "simulation_step", &err);
	CLU_ERRCHECK(err, "Failed to create 'simulation_step' kernel from program");

	// create memory buffer
	ocl.mem_bodies = clCreateBuffer(ocl.ctx, CL_MEM_READ_WRITE, N * sizeof(body), NULL, &err);
	CLU_ERRCHECK(err, "Failed to create memory buffer");

	// fill memory buffer
	err = clEnqueueWriteBuffer(ocl.queue, ocl.mem_bodies, CL_FALSE, 0, N * sizeof(body), B, 0, NULL, NULL);
	CLU_ERRCHECK(err, "Failed to write data to device");

	// set arguments
	cluSetKernelArguments(ocl.kernel_step, 1, sizeof(cl_mem), (void *)&ocl.mem_bodies);
}
int main()
{
//	unsigned long start_time = time_ms();

	// init matrix
	memset(u, 0, N*N*sizeof(VALUE));

	printf("Jacobi with  N=%d, L_SZ=%d, IT=%d\n", N, L_SZ, IT);
	printf("Kernel file name: %s\n", KERNEL_FILE_NAME);

	// init F
	for(int i=0; i<N; i++)
		for(int j=0; j<N; j++)
			f[i][j] = init_func(i, j);

	VALUE factor = pow((VALUE)1/N, 2);

	// ocl initialization
	cl_context context;
	cl_command_queue command_queue;
	cl_device_id device_id = cluInitDevice(CL_DEVICE, &context, &command_queue);

	// create memory buffers
	cl_int err;
	cl_mem matrix_U = clCreateBuffer(context, CL_MEM_READ_WRITE, N * N * sizeof(VALUE), NULL, &err);
	cl_mem matrix_F = clCreateBuffer(context, CL_MEM_READ_ONLY, N * N * sizeof(VALUE), NULL, &err);
	cl_mem matrix_TMP = clCreateBuffer(context, CL_MEM_READ_WRITE, N * N * sizeof(VALUE), NULL, &err);
	CLU_ERRCHECK(err, "Failed to create buffer for matrix");

	// used for profiling info
	cl_event ev_write_U;
	cl_event ev_write_F;
	cl_event ev_kernel;
	cl_event ev_read_TMP;

	double write_total, read_total, kernel_total;
	write_total = read_total = kernel_total = 0.0;

	// create kernel from source
	char tmp[1024];
	sprintf(tmp, "-DN=%i -DVALUE=%s", N, EXPAND_AND_QUOTE(VALUE));
	cl_program program = cluBuildProgramFromFile(context, device_id, KERNEL_FILE_NAME, tmp);
	cl_kernel kernel = clCreateKernel(program, "jacobi", &err);
	CLU_ERRCHECK(err, "Failed to create matrix_mul kernel from program");

	/* ---------------------------- main part ----------------------------------- */

	// also initialize target matrix with zero values!!!
	err = clEnqueueWriteBuffer(command_queue, matrix_TMP, CL_TRUE, 0, N * N * sizeof(VALUE), u, 0, NULL, &ev_write_U);
	CLU_ERRCHECK(err, "Failed to write matrix to device");
	// write f to device
	err = clEnqueueWriteBuffer(command_queue, matrix_F, CL_FALSE, 0, N * N * sizeof(VALUE), f, 0, NULL, &ev_write_F);
	CLU_ERRCHECK(err, "Failed to write matrix F to device");

	// write matrix u to device
	err = clEnqueueWriteBuffer(command_queue, matrix_U, CL_FALSE, 0, N * N * sizeof(VALUE), u, 0, NULL, &ev_write_U);
	CLU_ERRCHECK(err, "Failed to write matrix U to device");

	// define global work size
	size_t g_work_size[2] = {N, N};
	size_t l_work_size[2] = {L_SZ, L_SZ};

	cl_mem buffer_u;
	cl_mem buffer_tmp;

	for (int i = 0; i < IT; ++i) {
		// swap U and TMP arguments based on iteration counter
		if(i % 2 == 0) {
			buffer_u = matrix_U;
			buffer_tmp = matrix_TMP;
		} else {
			buffer_u = matrix_TMP;
			buffer_tmp = matrix_U;
		}
		// compute memory block dimensions
		int block_dim = (L_SZ + 2) * (L_SZ + 2);
		// set kernel arguments
		cluSetKernelArguments(kernel, 5,
							  sizeof(cl_mem), (void *)&buffer_u,
							  sizeof(cl_mem), (void *)&matrix_F,
							  sizeof(cl_mem), (void *)&buffer_tmp,
							  // local memory buffer
							  block_dim * sizeof(VALUE), NULL,
							  sizeof(VALUE), (void *)&factor);

		// execute kernel
		err = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, g_work_size, l_work_size, 0, NULL, &ev_kernel);
		CLU_ERRCHECK(err, "Failed to enqueue 2D kernel");
		// wait until execution completes
		clWaitForEvents(1, &ev_kernel);
		// add profiling information
		kernel_total += getDurationMS(ev_kernel);
	}

	// copy results back to host
	err = clEnqueueReadBuffer(command_queue, buffer_tmp, CL_TRUE, 0, N * N * sizeof(VALUE), u, 0, NULL, &ev_read_TMP);
	CLU_ERRCHECK(err, "Failed reading back result");

	// compute profiling information
	write_total += getDurationMS(ev_write_U);
	write_total += getDurationMS(ev_write_F);
	read_total += getDurationMS(ev_read_TMP);


	/* ---------------------------- evaluate results ---------------------------------- */
	// print result
	printf("OCL Device: %s\n", cluGetDeviceDescription(device_id, CL_DEVICE));
//	printf("Verification: %4s\n", (success) ? "OK" : "ERR");

	printf("Write total:      %9.4f ms\n", write_total);
	printf("Read total:       %9.4f ms\n", read_total);
	printf("Kernel execution: %9.4f ms\n", kernel_total);
	printf("Time total:       %9.4f ms\n\n", write_total + read_total + kernel_total);
#ifdef DEBUG
	print_result(u);
#endif



	/* ---------------------------- finalization ------------------------------------- */

	err = clFinish(command_queue);
	err |= clReleaseKernel(kernel);
	err |= clReleaseProgram(program);
	err |= clReleaseMemObject(matrix_U);
	err |= clReleaseMemObject(matrix_F);
	err |= clReleaseMemObject(matrix_TMP);
	err |= clReleaseCommandQueue(command_queue);
	err |= clReleaseContext(context);
	CLU_ERRCHECK(err, "Failed during ocl cleanup");

	return EXIT_SUCCESS;
}
int main(int argc, char** argv){
	
	srand(time(NULL));
	
	if(argc != 2) {
		printf("Usage: search [elements]\nExample: scan 10000\n");
		return -1;
	}
	
	unsigned long long start_time = time_ms();
	int event_amount=2;
	int elems = atoi(argv[1]);
	
	cl_int err;
	cl_event* events=allocateMemoryForEvent(event_amount);
	cl_ulong total_downsweep=0,total_hillissteele=0;
	size_t localWorkGroupSize_downSweep[1]={LOCALSIZE};	//must be power of two
	size_t globalWorkGroupSize_downSweep[1]={getPowerOfTwo(roundUp(LOCALSIZE,roundUp(LOCALSIZE, elems)/2))};	//calculating
	
	size_t localWorkGroupSize_hillissteele[1]={LOCALSIZE};	//must be power of two
	size_t globalWorkGroupSize_hillissteele[1]={roundUp(LOCALSIZE,elems)};	//calculating worksize
	
	
	int howManyWorkGroups=globalWorkGroupSize_downSweep[0]/LOCALSIZE;	//quotient is power of two, since dividend and divisor are power of two
	int sumBuffer_length_downSweep=howManyWorkGroups;
	int sumBuffer_length_hillis=getPowerOfTwo(roundUp(LOCALSIZE,elems)/LOCALSIZE);	

	VALUE *data = (VALUE*)malloc(elems*sizeof(VALUE));
	VALUE *result_seq=(VALUE*)malloc(elems*sizeof(VALUE));
	VALUE *result=(VALUE*)malloc(elems*sizeof(VALUE));
	VALUE *result_hillissteele=(VALUE*)malloc(elems*sizeof(VALUE));
	VALUE *sum=(VALUE*)malloc(sumBuffer_length_downSweep*sizeof(VALUE));
	VALUE *sum_hillis=(VALUE*)malloc(sumBuffer_length_hillis*sizeof(VALUE));
	

	memset(sum_hillis,0,sumBuffer_length_hillis*sizeof(VALUE));
	memset(result_seq,0,elems*sizeof(VALUE));
	
	// initialize data set (fill randomly)
	for(int j=0; j<elems; ++j) {
		data[j] =rand()%121;
	}
	
//	printResult(data, elems, 4, "INPUT");
	
	/*Sequential Scan*/
	for(int i=1; i<elems; i++){
	    result_seq[i]=result_seq[i-1]+data[i-1];
	}
	
//	printResult(result_seq, elems, 4, "Sequential Algorithm OUTPUT");
		
	//ocl initialization
	size_t deviceInfo;
	cl_context context;
	cl_command_queue command_queue;
	cl_device_id device_id = cluInitDevice(CL_DEVICE, &context, &command_queue);
	clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t), &deviceInfo,NULL );
  
	
	// create memory buffer
	cl_mem mem_data=clCreateBuffer(context, CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,elems*sizeof(VALUE), data, &err);
 	cl_mem mem_data_hillis=clCreateBuffer(context, CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,elems*sizeof(VALUE), data, &err);
	cl_mem mem_result=clCreateBuffer(context, CL_MEM_READ_WRITE, elems*sizeof(VALUE), NULL,&err);
	cl_mem mem_result_tmp=clCreateBuffer(context, CL_MEM_READ_WRITE, elems*sizeof(VALUE), NULL,&err);
	cl_mem mem_sum=clCreateBuffer(context, CL_MEM_READ_WRITE, sumBuffer_length_downSweep*sizeof(VALUE), NULL, &err);
	cl_mem mem_sum_hillis=clCreateBuffer(context, CL_MEM_READ_WRITE, sumBuffer_length_hillis*sizeof(VALUE), NULL, &err);
	CLU_ERRCHECK(err, "Failed to create Buffer");
    
	err=clEnqueueWriteBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL);
	CLU_ERRCHECK(err, "Failed to write values into mem_sum");
	

	// create kernel from source
	char tmp[1024];
 	sprintf(tmp,"-DVALUE=%s", EXPAND_AND_QUOTE(VALUE));
	cl_program program = cluBuildProgramFromFile(context, device_id, KERNEL_FILE_NAME, tmp);
	cl_kernel kernel_downSweep = clCreateKernel(program, "prefix_scan_downSweep", &err);
	cl_kernel kernel_hillissteele=clCreateKernel(program, "prefix_scan_hillissteele", &err);
	cl_kernel kernel_last_stage= clCreateKernel(program, "prefix_scan_last_stage", &err);
	CLU_ERRCHECK(err,"Could not load source program");
    

	
	/*-------------------------------------DOWNSWEEP-----------------------------------------------*/
	// set arguments
	int border=elems/2;
	int flag=1;
	
	cluSetKernelArguments(kernel_downSweep, 6, sizeof(cl_mem), (void *)&mem_data, sizeof(cl_mem), (void*)&mem_result,
			      sizeof(cl_mem), (void*)&mem_sum,sizeof(VALUE)*LOCALSIZE*2, NULL, sizeof(int), (void*)&border,
			      sizeof(int), (void*)&flag);

	//execute kernel  	     
	CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_downSweep, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL, &(events[1])), "DownSweep_Failed to enqueue 2D kernel");		      
	clFinish(command_queue);
	total_downsweep+=getProfileTotalTime(events,1);
	
	//read values back from device
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result, 0, NULL, NULL),"DownSweep_Failed to read Result Values");
	
	/*
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum, CL_TRUE, 0, sumBuffer_length_downSweep*sizeof(VALUE), sum, 0, NULL, NULL),"Failed to read Sum Values");
	clFinish(command_queue);
	printSumBuffer(sum, sumBuffer_length_downSweep,"DOWNSWEEP SUM");
	*/
	err=clEnqueueCopyBuffer(command_queue, mem_result, mem_result_tmp, 0, 0, elems*sizeof(VALUE),0,NULL,NULL);
	CLU_ERRCHECK(err,"DownSweep_Failed during copying buffer");
	
	
	/*+++++++++++++++++++++++++++++++++DOWNSWEEP-ON-SUM-BUFFER+++++++++++++++++++++++++++++++++++++++*/
	flag=0;
	border=sumBuffer_length_downSweep/2;	//since sumbuffer_length is power of two no further adaption is needed
	cluSetKernelArguments(kernel_downSweep, 6, sizeof(cl_mem), (void *)&mem_sum, sizeof(cl_mem), (void*)&mem_sum,
			      sizeof(cl_mem), (void*)&mem_sum,sizeof(VALUE)*sumBuffer_length_downSweep, NULL, sizeof(int), (void*)&border,
			      sizeof(int), (void*)&flag);
	
	howManyWorkGroups>1 ? globalWorkGroupSize_downSweep[0]=howManyWorkGroups/2:howManyWorkGroups;	//if 1 workgroup make adaption
	howManyWorkGroups>1 ? localWorkGroupSize_downSweep[0]=howManyWorkGroups/2:howManyWorkGroups;	//if 1 workgroup make adaption
	
	
	//execute kernel
	CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_downSweep, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL,&(events[1])), "DownSweep_Failed to enqueue 2D kernel");		      
	clFinish(command_queue);
	total_downsweep+=getProfileTotalTime(events,1);
	/*
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum, CL_TRUE, 0, sumBuffer_length_downSweep*sizeof(VALUE), sum, 0, NULL, NULL),"Failed to read Sum Values");	
	printSumBuffer(sum, sumBuffer_length_downSweep,"DOWNSWEEP SUM PREFIX");
	*/
	
	/*+++++++++++++++++++++++++++++++++DOWNSWEEP-LAST-STAGE(Add Sums)++++++++++++++++++++++++++++++++++++++++*/
	border=sumBuffer_length_downSweep;
	flag=1;
	cluSetKernelArguments(kernel_last_stage, 4, sizeof(cl_mem), (void *)&mem_result_tmp, sizeof(cl_mem), (void*)&mem_sum, sizeof(int), (void*)&border,
			      sizeof(int), (void*)&flag);
	globalWorkGroupSize_downSweep[0]=getPowerOfTwo(roundUp(LOCALSIZE,roundUp(LOCALSIZE, elems)/2));
	localWorkGroupSize_downSweep[0]=LOCALSIZE;
	
	//printf("GLOBALSIZE: %d\tLOCALSIZE %d\n",globalWorkGroupSize[0],localWorkGroupSize[0]);
	
	//execute kernel  	     
	CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_last_stage, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL, &(events[1])), "DownSweep_Failed to enqueue 2D kernel");		      
	clFinish(command_queue);
	total_downsweep+=getProfileTotalTime(events,1);
	//read values back from device
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result_tmp, CL_TRUE, 0, elems*sizeof(VALUE), result, 0, NULL, NULL),"DownSweep_Failed to read Result Values");
	
	
	/*---------------------------------------HILLISSTEELE----------------------------------------------------------*/
	
	
	flag=1;
	border=elems;
	
	cluSetKernelArguments(kernel_hillissteele, 6, sizeof(cl_mem), (void *)&mem_data_hillis, sizeof(cl_mem), (void*)&mem_result,
			      sizeof(cl_mem), (void*)&mem_sum_hillis,sizeof(VALUE)*LOCALSIZE*2, NULL, sizeof(int), (void*)&border,
			      sizeof(int), (void*)&flag);

	//execute kernel	
	//printf("GlobalSize: %d\tLocalWorkGroupSize: %d\n",globalWorkGroupSize[0], localWorkGroupSize[0]);
	//printf("Amount of WorkGroups: %d\n", globalWorkGroupSize[0]/localWorkGroupSize[0]);
	
	CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_hillissteele, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue 2D kernel_Inputbuffer");		      
	
	clFinish(command_queue);
	total_hillissteele+=getProfileTotalTime(events,0);
	//read values back from device
	/*
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result_hillissteele, 0, NULL, NULL),"Failed to read Result Values");
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL),"Failed to read Sum_1 Values");
	printSumBuffer(sum_hillis, sumBuffer_length_hillis, "HILLISSTEELE SUM");
	printResult(result_hillissteele,elems, 4, "HILLISSTEELE Temporary OUTPUT");
	*/
	
	
	/*++++++++++++++++++++++++++++++++++++++HILLISSTEELE-ON-SUM-BUFFER+++++++++++++++++++++++++++++++++++++*/
	
	flag=0;
	border=sumBuffer_length_hillis;
	cluSetKernelArguments(kernel_hillissteele, 6, sizeof(cl_mem), (void *)&mem_sum_hillis, sizeof(cl_mem), (void*)&mem_sum_hillis,
			      sizeof(cl_mem), (void*)&mem_sum_hillis,sizeof(VALUE)*howManyWorkGroups*2, NULL, sizeof(int), (void*)&border,
			      sizeof(int), (void*)&flag);

	//execute kernel
	globalWorkGroupSize_hillissteele[0]=sumBuffer_length_hillis;
	localWorkGroupSize_hillissteele[0]=sumBuffer_length_hillis;
	CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_hillissteele, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue 2D kernel_Sumbuffer");		      
	
	clFinish(command_queue);
	total_hillissteele+=getProfileTotalTime(events,0);
	
	//read values back from device
	/*
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL),"Failed to read Sum2 Values");
	printSumBuffer(sum_hillis, sumBuffer_length_hillis, "HILLISSTEELE SUM PREFIX");
	*/
	
	/*+++++++++++++++++++++++++++++++++++++HILLISSTEELE-LAST-STAGE(Add Sums)++++++++++++++++++++++++++++++++++++++++*/
	
	flag=0;
	border=sumBuffer_length_hillis;
	cluSetKernelArguments(kernel_last_stage, 4, sizeof(cl_mem), (void *)&mem_result, sizeof(cl_mem), (void*)&mem_sum_hillis, sizeof(int), (void*)&border,
			      sizeof(int), (void*)&flag);
	
	globalWorkGroupSize_hillissteele[0]=roundUp(LOCALSIZE,elems);
	localWorkGroupSize_hillissteele[0]=LOCALSIZE;
	
	//printf("GLOBALSIZE: %d\tLOCALSIZE %d\n",globalWorkGroupSize[0],localWorkGroupSize[0]);
	
	//execute kernel  	     
	CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_last_stage, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue kernel_Last_stage");		      
	clFinish(command_queue);
	total_hillissteele+=getProfileTotalTime(events,0);
	
	//read values back from device
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result_hillissteele, 0, NULL, NULL),"Hillissteele_Failed to read Result Values");
	
	
	/*-------------------------FINISHED---------------------------------------------*/
	
	//printResult(result_hillissteele, elems, 4, "HILLISSTEELE OUTPUT");
	//printResult(result, elems, 4, "IMPROVED IMPLEMENTATION OUTPUT");
	
	//verify results
	verifyResult(result_seq,result,elems, "Verifying result of DownSweep for bigger array size");
	verifyResult(result_seq,result_hillissteele,elems, "Verifying result of HILLISSTEELE for bigger array size");
	
	
	printProfileInfo(total_downsweep,"Improved Algorithm Time:");
	printProfileInfo(total_hillissteele,"Hillis & Steele Time:");
	printf("\nDEVICE INFO MAX_WORK_GROUP_SIZE: %d\n", (int) deviceInfo);
	printf("OCL Device: %s\n", cluGetDeviceDescription(device_id, CL_DEVICE));
	printf("Done, took %16llu ms\n", time_ms()-start_time);
    
	
	
	// finalization
	
	for(int i=0; i<event_amount; i++){
	    clReleaseEvent(events[i]);
	}
	
	err =  clFinish(command_queue);
	err |= clReleaseKernel(kernel_downSweep);
	err |= clReleaseKernel(kernel_last_stage);
	err |= clReleaseKernel(kernel_hillissteele);
	err |= clReleaseProgram(program);
	err |= clReleaseMemObject(mem_data);
	err |= clReleaseMemObject(mem_data_hillis);
	err |= clReleaseMemObject(mem_result);
	err |= clReleaseMemObject(mem_result_tmp);
	err |= clReleaseMemObject(mem_sum);
	err |= clReleaseMemObject(mem_sum_hillis);
	err |= clReleaseCommandQueue(command_queue);
	err |= clReleaseContext(context);
	CLU_ERRCHECK(err, "Failed during ocl cleanup");
    
	free(events);
	free(result);
	free(result_hillissteele);
	free(result_seq);
	free(sum);
	free(sum_hillis);
	
	return EXIT_SUCCESS; 
}