C++ (Cpp) icl_read_buffer примеры использования

Пример #1

0

Показать файл

Файл: 3DPLc_host.cpp Проект: LarsHunger/RAFT

/*
 * Evaluates the irregular field on the OpenCL device using the blocked,
 * loop-unrolled kernel (irr_fieldbu_kernel).
 *
 * The three host arrays are uploaded with non-blocking writes, the kernel is
 * launched over field_nr_points work-items rounded up to a whole number of
 * work-groups, and RF is read back with a blocking read.
 *
 * nr_lines          number of cl_double4 elements uploaded from vecs
 * field_nr_points   number of cl_double4 elements in RF (uploaded and read back)
 * linelength        forwarded to the kernel; together with nr_lines it sizes the y upload
 * vecs, y           host input arrays
 * RF                host array, uploaded before the launch and overwritten with the result
 * resolutionfactor  forwarded to the kernel unchanged
 */
void ocl_make_fieldbu_irregular(int nr_lines, int field_nr_points, int linelength, const cl_double4* vecs, const double* y, cl_double4* RF, double resolutionfactor)
{
	icl_write_buffer(vecs_buf, CL_FALSE, sizeof(cl_double4) * nr_lines, &vecs[0], NULL, NULL);
	// NOTE(review): because of operator precedence the trailing +1 adds one
	// byte, not one element, to the transfer size - confirm what was intended.
	icl_write_buffer(y_buf, CL_FALSE, sizeof(cl_double) * linelength*nr_lines+1, &y[0], NULL, NULL);
	icl_write_buffer(RF_buf, CL_FALSE, sizeof(cl_double4) * field_nr_points, &RF[0], NULL, NULL);

	// Round the work-item count up to a multiple of the work-group size using
	// integer arithmetic; going through float loses precision for large sizes.
	size_t szLocalWorkSize = LOCAL_GROUP_SIZE;
	size_t size = field_nr_points;
	size_t nrGroups = (size + szLocalWorkSize - 1) / szLocalWorkSize;
	size_t szGlobalWorkSize = nrGroups * szLocalWorkSize;

	// size_t arguments need %zu; %d is a format/argument type mismatch.
	printf("\n\nocl_make_field_irregular with blocking and unrolling (%d), global size %zu, local size %zu\n", LOOP_UNROLL, szGlobalWorkSize, szLocalWorkSize);
	icl_run_kernel(irr_fieldbu_kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 6,
		sizeof(cl_int), (void *)&nr_lines,
		sizeof(cl_int), (void *)&linelength,
		(size_t)0, (void *)vecs_buf,
		(size_t)0, (void *)y_buf,
		(size_t)0, (void *)RF_buf,
		sizeof(cl_double), (void *)&resolutionfactor
	);

	// Blocking read: RF holds the result once this call returns.
	icl_read_buffer(RF_buf, CL_TRUE, sizeof(cl_double4) * field_nr_points, &RF[0], NULL, NULL);
	icl_finish(device);
}

Пример #2

0

Показать файл

Файл: 3DPLc_host.cpp Проект: LarsHunger/RAFT

/*
 * Evaluates the regular field on the OpenCL device using the blocked,
 * loop-unrolled kernel (reg_field1bu_kernel).
 *
 * vecs and y are uploaded with non-blocking writes, the kernel is launched
 * with one work-item per field point, and
 * fielddim_x * fielddim_y * fielddim_z doubles are read back into RF.
 *
 * nr_lines                number of cl_double4 elements uploaded from vecs
 * fielddim_x/_y/_z        field dimensions; their product is the launch size
 * linelength              forwarded to the kernel; with nr_lines it sizes the y upload
 * vecs, y                 host input arrays
 * RF                      host output array, overwritten with the result
 * resolutionfactor        forwarded to the kernel unchanged
 */
void ocl_make_field1bu(int nr_lines, int fielddim_x, int fielddim_y, int fielddim_z, int linelength,
					const cl_double4* vecs, const double* y, double* RF,
					double resolutionfactor)
{
	icl_write_buffer(vecs_buf, CL_FALSE, sizeof(cl_double4) * nr_lines, &vecs[0], NULL, NULL);
	// NOTE(review): because of operator precedence the trailing +1 adds one
	// byte, not one element, to the transfer size - confirm what was intended.
	icl_write_buffer(y_buf, CL_FALSE, sizeof(cl_double) * linelength*nr_lines+1, &y[0], NULL, NULL);

	// Multiply in size_t: the product of three ints can overflow int, and an
	// int expression inside a braced size_t initializer is a narrowing conversion.
	size_t nrPoints = (size_t)fielddim_x * fielddim_y * fielddim_z;

	// NOTE(review): the global size is the raw point count and is not rounded
	// up to a multiple of LOCAL_GROUP_SIZE here - confirm callers guarantee that.
	size_t szGlobalWorkSize[1] = { nrPoints };
	size_t szLocalWorkSize[1] = { LOCAL_GROUP_SIZE };

	// size_t arguments need %zu; %d is a format/argument type mismatch.
	printf("\n\nocl_make_field1 with blocking and unrolling factor %d, global size (%zu), local size (%zu)\n", LOOP_UNROLL, szGlobalWorkSize[0], szLocalWorkSize[0]);
	icl_run_kernel(reg_field1bu_kernel, 1, szGlobalWorkSize, szLocalWorkSize, NULL, NULL, 9,
		sizeof(cl_int), (void *)&nr_lines,
		sizeof(cl_int), (void *)&fielddim_x,
		sizeof(cl_int), (void *)&fielddim_y,
		sizeof(cl_int), (void *)&fielddim_z,
		sizeof(cl_int), (void *)&linelength,
		(size_t)0, (void *)vecs_buf,
		(size_t)0, (void *)y_buf,
		(size_t)0, (void *)RF_buf,
		sizeof(cl_double), (void *)&resolutionfactor
	);

	// Blocking read: RF holds the result once this call returns.
	icl_read_buffer(RF_buf, CL_TRUE, sizeof(cl_double) * nrPoints, &RF[0], NULL, NULL);
	icl_finish(device);
}

Пример #3

0

Показать файл

Файл: 3DPLc_host.cpp Проект: LarsHunger/RAFT

/*
 * Regular-field variant with all loops unrolled (reg_field3_kernel).
 *
 * vecs and y are uploaded with non-blocking writes. The kernel is launched
 * over nr_lines * fielddim_x * fielddim_y * fielddim_z work-items, rounded up
 * to a whole number of work-groups, and
 * fielddim_x * fielddim_y * fielddim_z doubles are read back into RF.
 *
 * nr_lines                number of cl_double4 elements uploaded from vecs
 * fielddim_x/_y/_z        field dimensions
 * linelength              forwarded to the kernel; with nr_lines it sizes the y upload
 * vecs, y                 host input arrays
 * RF                      host output array, overwritten with the result
 * resolutionfactor        forwarded to the kernel unchanged
 */
void ocl_make_field3(int nr_lines, int fielddim_x, int fielddim_y, int fielddim_z, int linelength,
					const cl_double4* vecs, const double* y, double* RF,
					double resolutionfactor)
{
	icl_write_buffer(vecs_buf, CL_FALSE, sizeof(cl_double4) * nr_lines, &vecs[0], NULL, NULL);
	// NOTE(review): because of operator precedence the trailing +1 adds one
	// byte, not one element, to the transfer size - confirm what was intended.
	icl_write_buffer(y_buf, CL_FALSE, sizeof(cl_double) * linelength*nr_lines+1, &y[0], NULL, NULL);

	// Do the products in size_t so four int factors cannot overflow int.
	size_t nrPoints = (size_t)fielddim_x * fielddim_y * fielddim_z;
	size_t size = (size_t)nr_lines * nrPoints;

	// Round up to a multiple of the work-group size with integer arithmetic;
	// going through float loses precision for large sizes.
	size_t szLocalWorkSize = LOCAL_GROUP_SIZE;
	size_t nrGroups = (size + szLocalWorkSize - 1) / szLocalWorkSize;
	size_t szGlobalWorkSize = nrGroups * szLocalWorkSize;

	// size_t arguments need %zu; %d is a format/argument type mismatch.
	printf("\n\nocl_make_field3, global size (%zu), local size (%zu)\n", szGlobalWorkSize, szLocalWorkSize);
	icl_run_kernel(reg_field3_kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 9,
		sizeof(cl_int), (void *)&nr_lines,
		sizeof(cl_int), (void *)&fielddim_x,
		sizeof(cl_int), (void *)&fielddim_y,
		sizeof(cl_int), (void *)&fielddim_z,
		sizeof(cl_int), (void *)&linelength,
		(size_t)0, (void *)vecs_buf,
		(size_t)0, (void *)y_buf,
		(size_t)0, (void *)RF_buf,
		sizeof(cl_double), (void *)&resolutionfactor
	);

	// Blocking read: RF holds the result once this call returns.
	icl_read_buffer(RF_buf, CL_TRUE, sizeof(cl_double) * nrPoints, &RF[0], NULL, NULL);
	icl_finish(device);
}

Пример #4

0

Показать файл

Файл: sinewave.c Проект: 8l/insieme

/*
 * Driver for the "sinewave" kernel.
 *
 * Parses the command line, allocates a host output array of args->size
 * cl_float4 elements and, if at least one device is available, runs the
 * kernel args->loop_iteration times, reading the result back each time.
 * Energy measurement brackets the device work.
 *
 * Returns 0 on completion, 1 if the host output array cannot be allocated.
 */
int main(int argc, const char* argv[]) {
	icl_args* args = icl_init_args();
	icl_parse_args(argc, argv, args);

	chdir(PATH);

	int size = args->size;
	icl_print_args(args);

	cl_float4* output = (cl_float4*)malloc(sizeof(cl_float4) * size);
	if (output == NULL && size > 0) {
		// Without the host array the blocking read below would write through NULL.
		fprintf(stderr, "sinewave: cannot allocate %d output elements\n", size);
		icl_release_args(args);
		return 1;
	}

	icl_init_devices(args->device_type);

	icl_start_energy_measurement();

	if (icl_get_num_devices() != 0) {
		icl_device* dev = icl_get_device(args->device_id);

		icl_print_device_short_info(dev);
		icl_kernel* kernel = icl_create_kernel(dev, "sinewave.cl", "sinewave", "", ICL_SOURCE);

		// Round the work-item count up to a multiple of the work-group size
		// with integer arithmetic; going through float loses precision for
		// large sizes. Assumes args->local_size is non-zero.
		size_t szLocalWorkSize = args->local_size;
		size_t nrGroups = ((size_t)size + szLocalWorkSize - 1) / szLocalWorkSize;
		size_t szGlobalWorkSize = nrGroups * szLocalWorkSize;

		for (int i = 0; i < args->loop_iteration; ++i) {
			// A fresh device buffer is created and released on every iteration.
			icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * size);

			icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 2,
				(size_t)0, (void *)buf_output,
				sizeof(cl_int), (void *)&size);

			icl_read_buffer(buf_output, CL_TRUE, sizeof(cl_float4) * size, &output[0], NULL, NULL);
			icl_release_buffer(buf_output);
		}

		icl_release_kernel(kernel);
	}

	icl_stop_energy_measurement();

	// for the test check
	printf("Result check: OK\n");

	icl_release_args(args);
	icl_release_devices();
	free(output);
	return 0;
}

Пример #5

0

Показать файл

Файл: gpukdtree.c Проект: klois/gpukdtree

/*
 * Reads the particles back from the device, runs the energy statistic on
 * them and writes them to "./output/snapshot_NNN", where NNN is a counter
 * that is incremented after every snapshot written.
 *
 * particles_host    host array receiving nParticles particles from the device
 * particles_device  device buffer to read from (blocking read)
 * nParticles        number of particles
 * dev               not used by this function
 * current_time      stored in the snapshot header and passed to energy_statistic
 *
 * If the temporary output array cannot be allocated, the snapshot file is
 * skipped (the device read and the energy statistic still happen).
 */
void out_snapshot(struct Particle *particles_host, icl_buffer* particles_device, const UINT nParticles, icl_device* dev, const FLOAT current_time)
{
	static int cs = 0; // index of the next snapshot file
	char fn[200];
	io_header header;
	UINT j; // same type as nParticles, avoids a signed/unsigned comparison

	icl_read_buffer(particles_device, CL_TRUE, sizeof(struct Particle) * nParticles, particles_host, NULL, NULL);
	energy_statistic(particles_host, nParticles, current_time);

	particle_data *P = (particle_data*)malloc(sizeof(particle_data) * nParticles);
	if(P == NULL && nParticles != 0)
	{
		fprintf(stderr, "out_snapshot: cannot allocate %lu particles, snapshot %03d skipped\n", (unsigned long)nParticles, cs);
		return;
	}

	snprintf(fn, sizeof(fn), "./output/snapshot_%03d", cs);

	// Convert the device particle layout into the snapshot record layout.
	for(j = 0; j < nParticles; ++j)
	{
		const struct Particle *src = &particles_host[j];

		P[j].Pos[0] = src->pos.x;
		P[j].Pos[1] = src->pos.y;
		P[j].Pos[2] = src->pos.z;

		P[j].Vel[0] = src->vel.x;
		P[j].Vel[1] = src->vel.y;
		P[j].Vel[2] = src->vel.z;

		// the per-particle mass is not copied; a single mass goes into the header
		P[j].Id = src->id;

		P[j].Accel[0] = src->acc.x;
		P[j].Accel[1] = src->acc.y;
		P[j].Accel[2] = src->acc.z;
	}

	memset(&header, 0, sizeof(io_header));
	header.time = current_time;
	header.num_files = 1;
	header.npart[1] = header.npartTotal[1] = nParticles;
	header.mass[1] = particles_host[0].mass; //for now all particles have the same mass

	unsigned blocks = 8199; //0b10000000000111;
	write_snapshot_format2(fn, &header, P, blocks);

	cs++;

	free(P);
}

Пример #6

0

Показать файл

Файл: 3DPLc_host.cpp Проект: LarsHunger/RAFT

/*
 * Out-of-core variant of the regular field computation
 * (reg_field_outofcore_kernel).
 *
 * vecs and y are uploaded once. The field of
 * fielddim_x * fielddim_y * fielddim_z points is then processed in chunks of
 * at most OUTOFCORE_SIZE points: each iteration launches the kernel for one
 * chunk, passing the chunk's starting offset as the last argument, and
 * enqueues a non-blocking read of that chunk into RF at the same offset.
 * icl_finish() is called once after the loop.
 *
 * nr_lines                number of cl_double4 elements uploaded from vecs
 * fielddim_x/_y/_z        field dimensions
 * linelength              forwarded to the kernel; with nr_lines it sizes the y upload
 * vecs, y                 host input arrays
 * RF                      host output array, filled chunk by chunk
 * resolutionfactor        forwarded to the kernel unchanged
 */
void ocl_make_field_outcore(int nr_lines, int fielddim_x, int fielddim_y, int fielddim_z, int linelength,
					const cl_double4* vecs, const double* y, double* RF,
					double resolutionfactor)
{
	// Multiply in size_t so the product of two ints cannot overflow int.
	size_t xy = (size_t)fielddim_x * fielddim_y;
	size_t z = fielddim_z;
	size_t overall_size = xy * z;
	size_t chunk_size = OUTOFCORE_SIZE;
	icl_write_buffer(vecs_buf, CL_FALSE, sizeof(cl_double4) * nr_lines, &vecs[0], NULL, NULL);
	// NOTE(review): because of operator precedence the trailing +1 adds one
	// byte, not one element, to the transfer size - confirm what was intended.
	icl_write_buffer(y_buf, CL_FALSE, sizeof(cl_double) * linelength*nr_lines+1, &y[0], NULL, NULL);

	// size_t arguments need %zu; %d is a format/argument type mismatch.
	printf("ocl_make_field out-of-core with overall size of %zu, chunked by %zu\n", overall_size, chunk_size);

	// for each chunk, we send and receive back a new part of the field
	for(size_t offset = 0, bid = 0; offset < overall_size; offset += chunk_size, bid++)
	{
		printf("block %zu offset %zu,", bid, offset);
		size_t current_chunk_size = MIN(overall_size-offset, chunk_size);
		// NOTE(review): the last chunk's global size is not rounded up to a
		// multiple of LOCAL_GROUP_SIZE - confirm this is acceptable.
		size_t szGlobalWorkSize[1] = { current_chunk_size };
		size_t szLocalWorkSize[1] = { LOCAL_GROUP_SIZE };
		unsigned long long in_offset = offset;

		icl_run_kernel(reg_field_outofcore_kernel, 1, szGlobalWorkSize, szLocalWorkSize, NULL, NULL, 10,
			sizeof(cl_int), (void*) &nr_lines,
			sizeof(cl_int), (void*) &fielddim_x,
			sizeof(cl_int), (void*) &fielddim_y,
			sizeof(cl_int), (void*) &fielddim_z,
			sizeof(cl_int), (void*) &linelength,
			(size_t)0, (void*) vecs_buf,
			(size_t)0, (void*) y_buf,
			(size_t)0, (void*) RF_buf,
			sizeof(cl_double), (void*) &resolutionfactor,
			sizeof(cl_ulong), (void*) &in_offset
		);
		// NOTE(review): non-blocking read while the next iteration reuses
		// RF_buf - presumably relies on in-order command execution; confirm.
		icl_read_buffer(&RF_buf[0], CL_FALSE, sizeof(cl_double) * current_chunk_size, &RF[offset], NULL, NULL);
	}
	icl_finish(device);
	printf("\n");
}

Пример #7

0

Показать файл

Файл: gpukdtree.c Проект: klois/gpukdtree

/*
 * Builds or updates the kd-tree and walks it to compute particle accelerations.
 *
 * The kd-tree device buffer is kept in a function-local static between calls.
 *
 *  - First call (no tree buffer yet), whatever the mode: the buffer is
 *    created, the tree is built and walked. When compiled with timing == 1
 *    the particles are then read back into ref and the tree is walked a
 *    second time.
 *  - mode == 1: the tree is rebuilt and walked.
 *  - mode == 2: the tree is updated in place and walked.
 *  - any other mode: the kd-tree buffer is released.
 *
 * Returns the tree height: the value returned by buildTree() when the tree
 * was (re)built, otherwise the treeHeight argument unchanged.
 */
UINT compute_acceleration(UINT mode, icl_buffer* nodelist, icl_buffer* particles, const UINT nParticles, const FLOAT eps, UINT treeHeight, icl_device* dev, struct Particle* ref)
{
	static icl_buffer* kdTree = NULL;

	if(kdTree == NULL) //just for the first time to initialize acceleration for opening criterion
	{
		kdTree = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct KdNode) * (nParticles * 2 - 1));
		treeHeight = buildTree(nodelist, particles, kdTree, nParticles, dev);
		// first walk through the entire tree since acceleration is 0
		printf("First tree walk\n");
		walk(kdTree, particles, nParticles, eps, dev);
#if timing == 1
		// store acceleration of particles by unfolding the entire tree = direct force summation, used for correctness validation
		printf("Pruned tree walk\n");
		icl_read_buffer(particles, CL_TRUE, sizeof(struct Particle) * nParticles, &ref[0], NULL, NULL);
		// second walk with pruned tree
		walk(kdTree, particles, nParticles, eps, dev);
#endif
	}
	else if(mode == 1) //rebuild the tree
	{
		treeHeight = buildTree(nodelist, particles, kdTree, nParticles, dev);
		walk(kdTree, particles, nParticles, eps, dev);
	}
	else if(mode == 2) //TODO: dynamic tree update
	{
		updateTree(nodelist, particles, kdTree, nParticles, treeHeight, dev);
		walk(kdTree, particles, nParticles, eps, dev);
	}
	else //end of sim, release kdTree buffer
	{
		icl_release_buffer(kdTree);
		// Forget the released buffer so a later call starts from a fresh tree
		// instead of using or releasing a dangling pointer.
		kdTree = NULL;
	}

	return treeHeight;
}

Пример #8

0

Показать файл

Файл: mers_twister.c Проект: 8l/insieme

/*
 * Driver for the "mersenne_twister" kernel.
 *
 * Parses the command line, fills four host input arrays of args->size
 * elements with their own indices and, if at least one device is available,
 * runs the kernel args->loop_iteration times, reading the cl_float4 results
 * back each time. Energy measurement brackets the device work. No result
 * check is implemented; "Result check: OK" is printed unconditionally.
 *
 * Returns 0 on completion, 1 if a host array cannot be allocated.
 */
int main(int argc, const char* argv[]) {
	chdir(PATH);
	icl_args* args = icl_init_args();
	icl_parse_args(argc, argv, args);
	icl_print_args(args);

	int size = args->size;

	cl_uint* ma = (cl_uint*) malloc(sizeof(cl_uint) * size);
	cl_uint* b = (cl_uint*) malloc(sizeof(cl_uint) * size);
	cl_uint* c = (cl_uint*) malloc(sizeof(cl_uint) * size);
	cl_uint* seed = (cl_uint*) malloc(sizeof(cl_uint) * size);
	cl_float4* result = (cl_float4*)malloc(sizeof(cl_float4) * size);

	if (size > 0 && (ma == NULL || b == NULL || c == NULL || seed == NULL || result == NULL)) {
		// The fill loop and the buffer transfers below would dereference NULL.
		fprintf(stderr, "mers_twister: cannot allocate host arrays for %d elements\n", size);
		free(ma);
		free(b);
		free(c);
		free(seed);
		free(result);
		icl_release_args(args);
		return 1;
	}

	// Signed index: comparing an unsigned index against a negative size
	// would turn this into a (nearly) unbounded loop.
	for (int i = 0; i < size; ++i) {
		ma[i] = (cl_uint)i;
		b[i] = (cl_uint)i;
		c[i] = (cl_uint)i;
		seed[i] = (cl_uint)i;
	}

	icl_init_devices(args->device_type);

	icl_start_energy_measurement();

	if (icl_get_num_devices() != 0) {
		icl_device* dev = icl_get_device(args->device_id);

		icl_print_device_short_info(dev);
		icl_kernel* kernel = icl_create_kernel(dev, "mers_twister.cl", "mersenne_twister", "", ICL_SOURCE);

		// Round the work-item count up to a multiple of the work-group size
		// with integer arithmetic; going through float loses precision for
		// large sizes. Assumes args->local_size is non-zero.
		size_t szLocalWorkSize = args->local_size;
		size_t nrGroups = ((size_t)size + szLocalWorkSize - 1) / szLocalWorkSize;
		size_t szGlobalWorkSize = nrGroups * szLocalWorkSize;

		for (int i = 0; i < args->loop_iteration; ++i) {
			// Device buffers are created, filled and released on every iteration.
			icl_buffer* buf_ma = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_uint) * size);
			icl_buffer* buf_b = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_uint) * size);
			icl_buffer* buf_c = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_uint) * size);
			icl_buffer* buf_seed = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_uint) * size);
			icl_buffer* buf_result = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * size);

			// Only the last upload is blocking.
			icl_write_buffer(buf_ma, CL_FALSE, sizeof(cl_uint) * size, &ma[0], NULL, NULL);
			icl_write_buffer(buf_b, CL_FALSE, sizeof(cl_uint) * size, &b[0], NULL, NULL);
			icl_write_buffer(buf_c, CL_FALSE, sizeof(cl_uint) * size, &c[0], NULL, NULL);
			icl_write_buffer(buf_seed, CL_TRUE, sizeof(cl_uint) * size, &seed[0], NULL, NULL);

			icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 6,
				(size_t)0, (void *)buf_ma,
				(size_t)0, (void *)buf_b,
				(size_t)0, (void *)buf_c,
				(size_t)0, (void *)buf_seed,
				(size_t)0, (void *)buf_result,
				sizeof(cl_int), (void *)&size
			);

			icl_read_buffer(buf_result, CL_TRUE, sizeof(cl_float4) * size, &result[0], NULL, NULL);
			icl_release_buffers(5, buf_ma, buf_b, buf_c, buf_seed, buf_result);
		}

		icl_release_kernel(kernel);
	}

	icl_stop_energy_measurement();


	if (args->check_result) {
		printf("======================\n= mersenne twister test\n");
		printf("Check not Implemented\n");
		printf("Result check: OK\n");
	} else {
		printf("Result check: OK\n");
	}

	// The parsed arguments were never released in the original.
	icl_release_args(args);
	icl_release_devices();
	free(ma);
	free(b);
	free(c);
	free(seed);
	free(result);
	return 0;
}

Пример #9

0

Показать файл

Файл: gpukdtree.c Проект: klois/gpukdtree

/*
 * Main simulation loop.
 *
 * Creates the device node list, uploads the root node, computes the initial
 * accelerations and applies a half-step kick. It then repeats drift ->
 * compute_acceleration -> kick with a fixed time step until current_time
 * reaches t_max. A snapshot is written whenever more than timeBetSnapshot
 * has elapsed since the last one, and once more at the final time. After
 * every step the first particle is read back and printed.
 *
 * When compiled with timing == 1 the function returns right after the first
 * acceleration computation.
 *
 * t_max               simulation end time
 * eps                 forwarded to compute_acceleration
 * ErrTolIntAccuracy   currently unused (only referenced by a disabled calcTimestep call)
 * particles_host      host particle array used for read-backs and snapshots
 * particles_device    device particle buffer
 * nParticles          number of particles
 * dev                 device the buffers live on
 * tree                supplies nodelist[0], which is initialised here as the root node
 * ref                 forwarded to compute_acceleration
 */
void run(const FLOAT t_max, const FLOAT eps, const FLOAT ErrTolIntAccuracy, struct Particle *particles_host, icl_buffer* particles_device, const UINT nParticles, icl_device* dev, struct Tree *tree, struct Particle *ref)
{
	UINT k = 0;
	UINT treeHeight = 0;
	// fixed time step (earlier experiments used 1.5e-6 and 1.220703125e-5)
	FLOAT dt=3.05176e-06;
	FLOAT current_time = 0.0; //time before current full timestep (drift), kicks are at current_time-+dt/2.0
	FLOAT timeBetSnapshot = 1e-3;
	FLOAT timeLastSnapshot = 0.0;

	// create root node in nodelist
	tree->nodelist[0].particlesLow = 0;
	tree->nodelist[0].particlesHigh = nParticles;
	tree->nodelist[0].level = 0;
	tree->nodelist[0].address = 0;

	icl_buffer* nodelist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct Node) * (nParticles * 2 - 1));
	// copy root node to ocl device
	icl_write_buffer(nodelist, CL_TRUE, sizeof(struct Node), &(tree->nodelist[0]), NULL, NULL);

	//just for the first time, kick to half timestep
	treeHeight = compute_acceleration(1, nodelist, particles_device, nParticles, eps, treeHeight, dev, ref);
#if timing == 1
	// Timing builds stop here; release the kd-tree buffer held by
	// compute_acceleration and the node list so the early exit does not leak them.
	compute_acceleration(0, nodelist, particles_device, nParticles, eps, treeHeight, dev, ref);
	icl_release_buffer(nodelist);
	return;
#endif
	//dt = calcTimestep(eps, ErrTolIntAccuracy, particles, nParticles);
	kick(dt/2.0, particles_device, nParticles, dev);

	//TEST: print the first particle after the initial half kick
	icl_read_buffer(particles_device, CL_TRUE, sizeof(struct Particle) * nParticles, particles_host, NULL, NULL);
	printf("\tid: %d pos: %g %g %g vel: %g %g %g\n", particles_host[0].id,  particles_host[0].pos.x, particles_host[0].pos.y, particles_host[0].pos.z, particles_host[0].vel.x, particles_host[0].vel.y, particles_host[0].vel.z);
	printf("\tacc: %g %g %g\n", particles_host[0].acc.x, particles_host[0].acc.y, particles_host[0].acc.z);

	while(current_time < t_max)
	{
		current_time += dt;
		printf("___step: %d time: %g\n", k++, current_time);

		//drift to next full timestep at current_time
		drift(dt, particles_device, nParticles, dev);

		//get new accelerations
		treeHeight = compute_acceleration(2, nodelist, particles_device, nParticles, eps, treeHeight, dev, ref); //TODO: mode 2: implement dynamic tree update

		//kick particles to current_time+dt/2.0
		kick(dt, particles_device, nParticles, dev);

		//output & energy statistic
		if(current_time-timeLastSnapshot > timeBetSnapshot)
		{
			out_snapshot(particles_host, particles_device, nParticles, dev, current_time);
			timeLastSnapshot = current_time;
		}

		//TEST: print the first particle after every step
		icl_read_buffer(particles_device, CL_TRUE, sizeof(struct Particle) * nParticles, particles_host, NULL, NULL);
		printf("\tid: %d pos: %g %g %g vel: %g %g %g\n", particles_host[0].id,  particles_host[0].pos.x, particles_host[0].pos.y, particles_host[0].pos.z, particles_host[0].vel.x, particles_host[0].vel.y, particles_host[0].vel.z);
		printf("\tacc: %g %g %g\n", particles_host[0].acc.x, particles_host[0].acc.y, particles_host[0].acc.z);
	}

	out_snapshot(particles_host, particles_device, nParticles, dev, current_time); //write a snapshot also for the final time
	printf("final time reached: %g\n", current_time);

	compute_acceleration(0, nodelist, particles_device, nParticles, eps, treeHeight, dev, ref); //clean up

	icl_release_buffer(nodelist);
}

Пример #10

0

Показать файл

Файл: gpukdtree.c Проект: klois/gpukdtree

UINT buildTree(icl_buffer *nodelist, icl_buffer *particlesD, icl_buffer *treeD, UINT nParticles, icl_device* dev)
{
	UINT level = 1;
	UINT nNodes = nParticles * 2 - 1;

	icl_timer* timer = icl_init_timer(ICL_MILLI);
//	void icl_start_timer(icl_timer* timer);
	double time = 0;

	// overapproximate size of temporal lists
/*	struct Node** activelist = (struct Node**)malloc(nParticles * sizeof(struct Node*)); UINT activeN = 0;
	struct Node** smalllist = (struct Node**)malloc(nParticles * sizeof(struct Node*)); UINT smallN = 0;
	struct Node** nextlist = (struct Node**)malloc(nParticles * sizeof(struct Node*)); UINT nextN = 0;*/
	icl_buffer* activelist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(NodeId) * nParticles);
	icl_buffer* smalllist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(NodeId) * nParticles);
	icl_buffer* nextlist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(NodeId) * nParticles);
	icl_buffer* sizes = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(UINT) * 5); // holds the current size of each of 3 buffers:
		// 0 activelist
		// 1 nodelist
		// 2 smalllist
		// 3 old max level
		// 4 new max level

	UINT maxNchunks = ((nParticles / fmin(T, chunk_size)) * 2) -1;
//	assert(maxNchunks <= 256 && "adapt implementation"); // TODO allow more than 256 chunks per node
	icl_buffer* chunks = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct Chunk) * maxNchunks);
	icl_buffer* bboxes = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct BBox) * maxNchunks);

	size_t localSize = 1;
	size_t globalSize = 1;

/*
struct Particle* particles = (struct Particle*)malloc(3000 * sizeof(struct Particle));
icl_read_buffer(particlesD, CL_TRUE, sizeof(struct Particle) * 3000, &particles[0], NULL, NULL);
printf("%f %f %f\n", particles[0].pos.x, particles[0].pos.y, particles[0].pos.z);
*/
	// compile OpenCL kernels
	icl_kernel* init = icl_create_kernel(dev, "kernel/init.cl", "init", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* resetChunks = icl_create_kernel(dev, "kernel/init.cl", "memset_chunks", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* gp2c = icl_create_kernel(dev, "kernel/groupToChunks.cl", "groupParticlesIntoChunks", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* cBBox = icl_create_kernel(dev, "kernel/chunkedBBox.cl", "chunkedBBox", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* bBox = icl_create_kernel(dev, "kernel/bBox.cl", "bBox", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* sln = icl_create_kernel(dev, "kernel/splitLargeNodes.cl", "splitLargeNodes", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* sortP = icl_create_kernel(dev, "kernel/sortParticlesToChilds.cl", "sortParticlesToChilds", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* snf = icl_create_kernel(dev, "kernel/smallNodeFiltering.cl", "smallNodeFiltering", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* pnl = icl_create_kernel(dev, "kernel/packNextlist.cl", "packNextlist", KERNEL_BUILD_MACRO, ICL_SOURCE);	

	//////////////////////////////////////////////////////////////////////////
	icl_kernel* preScan  = icl_create_kernel(dev, "kernel/sortP_prescan.cl", "sortP_prescan_chunked", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* postScan = icl_create_kernel(dev, "kernel/sortP_postscan.cl", "sortP_postscan_chunked", KERNEL_BUILD_MACRO, ICL_SOURCE);
	segmented_scan_init(nParticles, dev, KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* memset_int_s  = icl_create_kernel(dev, "kernel/init.cl", "memset_int_s", KERNEL_BUILD_MACRO, ICL_SOURCE);

	// approach with segmented scan
	icl_buffer *scan_data = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(int) * nParticles);
	icl_buffer *scan_flag = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(int) * nParticles);
	icl_buffer* buffered_particles = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct Particle) * nParticles);


#if timing == 1
	icl_timer* timer_gp2c =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_cBBox =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_bBox =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_sln =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_sortP =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_snf =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_pnl =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_ran =  icl_init_timer(ICL_MILLI);


	icl_timer* timer_prescan =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_scan =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_postscan =  icl_init_timer(ICL_MILLI);

#endif
	//add root node to the activelist and initialize size lists
	icl_run_kernel(init, 1, &globalSize, &localSize, NULL, NULL, 3,
			(size_t)0, (void *)activelist,
			(size_t)0, (void *)sizes,
			(size_t)0, (void *)particlesD);

	UINT activeN = 1;

	icl_finish(dev);

	// smallest power of 2 bigger or equal to maxxNchnunks
	UINT pow2maxNchunks = pow2roundup(maxNchunks);
	// processLargeNode
	while(activeN != 0)
	{

		icl_start_timer(timer);
		// group triangles into chunks
		size_t localSize1 = min(pow2maxNchunks, 256);

#if timing == 1
		icl_start_timer(timer_gp2c);
#endif
		size_t globalSize1 = ((maxNchunks + localSize1 -1) / localSize1) * localSize1;
		// reset chunks
		icl_run_kernel(resetChunks, 1, &globalSize1, &localSize1, NULL, NULL, 2,
				(size_t)0, (void *)chunks,
				sizeof(UINT), &maxNchunks);

		globalSize1 = localSize1 * activeN;
		// split every node in chunk of chunk_size
		icl_run_kernel(gp2c, 1, &globalSize1, &localSize1, NULL, NULL, 4,
				(size_t)0, (void *)nodelist,
				(size_t)0, (void *)activelist,
				(size_t)0, (void *)chunks,
				sizeof(UINT), &activeN);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_gp2c);
#endif

		// compute per chunk bounding box
		size_t localSize2 = chunk_size;
		size_t globalSize2 = maxNchunks * chunk_size;
#if timing == 1
		icl_start_timer(timer_cBBox);
#endif

		icl_run_kernel(cBBox, 1, &globalSize2, &localSize2, NULL, NULL, 5,
				(size_t)0, (void *)nodelist,
				(size_t)0, (void *)activelist,
				(size_t)0, (void *)particlesD,
				(size_t)0, (void *)chunks,
				(size_t)0, (void *)bboxes);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_cBBox);
#endif

		// compute per node bounding box
		size_t localSize3 = min(pow2maxNchunks, 256);
		size_t globalSize3 = localSize3 * activeN;
#if timing == 1
		icl_start_timer(timer_bBox);
#endif
		icl_run_kernel(bBox, 1, &globalSize3, &localSize3, NULL, NULL, 4,
						(size_t)0, (void *)nodelist,
						(size_t)0, (void *)activelist,
						(size_t)0, (void *)bboxes,
						sizeof(UINT), &activeN);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_bBox);
#endif

		// split large nodes
		size_t localSize4 = 256;
		size_t globalSize4 = ((activeN + 255) / 256) * 256;
#if timing == 1
		icl_start_timer(timer_sln);
#endif
		icl_run_kernel(sln, 1, &globalSize4, &localSize4, NULL, NULL, 5,
						(size_t)0, (void *)nodelist,
						(size_t)0, (void *)activelist,
						(size_t)0, (void *)nextlist,
						(size_t)0, (void *)sizes,
						sizeof(UINT), &activeN);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_sln);
#endif

		///////////////////////////////////////////////////////////////////////////////
		// XXx replaced with segmented scan

		//globalSize = (activeN+1) * 256;
#if timing == 1
		icl_start_timer(timer_sortP);
#endif


#if DEVICE == ICL_CPU 
		// sort particles to child nodes
		size_t localSize5 = 256;
		size_t globalSize5 = ((activeN + 255) / 256) * 256;
		icl_run_kernel(sortP, 1, &globalSize5, &localSize5, NULL, NULL, 5,
			(size_t)0, (void *)nodelist,
			(size_t)0, (void *)particlesD,
			(size_t)0, (void *)activelist,
			(size_t)0, (void *)nextlist,
			sizeof(UINT), &activeN
		);
#else		
		// init scan_flag to 1
		cl_int initFlag = 1;
		size_t np = (size_t)((nParticles + localSize4 -1 ) / localSize4) * localSize4;
		icl_run_kernel(memset_int_s, 1, &np, &localSize4, NULL, NULL, 3,
			(size_t)0, (void *)scan_flag,
			sizeof(cl_int), &initFlag,
			sizeof(UINT), &nParticles
		);

#if timing == 1
		icl_start_timer(timer_prescan);
#endif
		// pre-scan fills data0 and data1 with 1 and 0 whenever value < pivot
		localSize = chunk_size;
//		globalSize = activeN * 256;
		globalSize = maxNchunks * chunk_size;
		icl_run_kernel(preScan, 1, &globalSize, &localSize, NULL, NULL, 7,
			(size_t)0, (void *)nodelist,
			(size_t)0, (void *)chunks,
			(size_t)0, (void *)particlesD,
			(size_t)0, (void *)activelist,
			sizeof(UINT), &activeN,
			(size_t)0, (void *)scan_data,
			(size_t)0, (void *)scan_flag						
		);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_prescan);
#endif

#if timing == 1
		icl_start_timer(timer_scan);
#endif
		// scan for
		scan(scan_data, scan_flag, nParticles);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_scan);
#endif
		// copy partially sorted data to the final
		icl_copy_buffer(particlesD, buffered_particles, sizeof(struct Particle) * nParticles, NULL, NULL);
//		swap(particlesD, buffered_particles);
#if timing == 1
		icl_start_timer(timer_postscan);
#endif

		localSize = chunk_size;
		globalSize = maxNchunks * chunk_size;
		icl_run_kernel(postScan, 1, &globalSize, &localSize, NULL, NULL, 7,
			(size_t)0, (void *)nodelist,
			(size_t)0, (void *)chunks,
			(size_t)0, (void *)particlesD,
			(size_t)0, (void *)activelist,
			sizeof(UINT), &activeN,

			(size_t)0, (void *)buffered_particles,
			(size_t)0, (void *)scan_data						
		);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_postscan);
#endif

icl_finish(dev);
#endif
		///////////////////////////////////////////////////////////////////////////////

#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_sortP);
#endif
		
		// small node filtering
		size_t localSize6 = 256;
		size_t globalSize6 = ((activeN*2 + 255) / 256) * 256;
#if timing == 1
		icl_start_timer(timer_snf);
#endif
		icl_run_kernel(snf, 1, &globalSize6, &localSize6, NULL, NULL, 4,
			(size_t)0, (void *)nodelist,
			(size_t)0, (void *)nextlist,
			(size_t)0, (void *)smalllist,
			(size_t)0, (void *)sizes
			//, sizeof(UINT), &nParticles
		);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_snf);
#endif

		// packing of nextlist
		size_t localSize7 = 1;
		size_t globalSize7 = 1;
#if timing == 1
		icl_start_timer(timer_pnl);
#endif
		icl_run_kernel(pnl, 1, &globalSize7, &localSize7, NULL, NULL, 2,
			(size_t)0, (void *)nextlist,
			(size_t)0, (void *)sizes
		);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_pnl);
#endif

		// swap nextlist and activelist
		swap(&nextlist, &activelist);

#if timing == 1
		icl_start_timer(timer_ran);
#endif
		// read size of next activelist set in kernel
		icl_read_buffer(sizes, CL_TRUE, sizeof(UINT), &activeN, NULL, NULL);
		icl_finish(dev);
#if timing == 1
		icl_stop_timer(timer_ran);
#endif

		++level;
//printf("%d: ActiveN %d\n", level, activeN);
		time = icl_stop_timer(timer);

	}

	icl_release_kernel(init);
	icl_release_kernel(gp2c);
	icl_release_kernel(cBBox);
	icl_release_kernel(bBox);
	icl_release_kernel(sln);
	icl_release_kernel(sortP);
	icl_release_kernel(snf);
	icl_release_kernel(pnl);
//////////////////////////////////////////////////////////////////////////
	icl_release_kernel(preScan);
	icl_release_kernel(postScan);
	segmented_scan_release();

	icl_release_buffers(3, scan_data, scan_flag, buffered_particles);

#if timing == 1
	printf("gp2c %f\ncBBox %f\nbBox %f\nsln  %f\nsortP %f\nsnf %f\npnl %f\nran %f\n\n",
			timer_gp2c->current_time,
			timer_cBBox->current_time,
			timer_bBox->current_time,
			timer_sln->current_time,
			timer_sortP->current_time,
			timer_snf->current_time,
			timer_pnl->current_time,
			timer_ran->current_time);
	icl_release_timer(timer_gp2c);
	icl_release_timer(timer_cBBox);
	icl_release_timer(timer_bBox);
	icl_release_timer(timer_sln);
	icl_release_timer(timer_sortP);
	icl_release_timer(timer_snf);
	icl_release_timer(timer_pnl);

	printf("prescan %f\nscan %f\npostscan %f\n\n", timer_prescan->current_time, timer_scan->current_time, timer_postscan->current_time);

	icl_release_timer(timer_prescan);
	icl_release_timer(timer_scan);
	icl_release_timer(timer_postscan);
#endif

/*
icl_read_buffer(nodelist, CL_TRUE, sizeof(struct Node) * 6000, tree->nodelist, NULL, NULL);
printf("node: %d, left %d, right %d", tree->nodelist[0].particlesHigh - tree->nodelist[0].particlesLow,
		tree->nodelist[1].particlesHigh - tree->nodelist[1].particlesLow, tree->nodelist[2].particlesHigh - tree->nodelist[2].particlesLow);

printBox(tree->nodelist[49].bounding_box);
printBox(tree->nodelist[53].bounding_box);
printBox(tree->nodelist[54].bounding_box);


for(int i = 0; i < 6000; ++i)
	if(tree->nodelist[i].bounding_box.box[0].x != 0.0)
		printBox(tree->nodelist[i].bounding_box);
*/
	//small nodes stage
//	preprocessSmallNodes(smalllist);
	icl_release_buffers(3, activelist, chunks, bboxes);
	icl_kernel* sasl = icl_create_kernel(dev, "kernel/swapActiveAndSmalllist.cl", "swapActiveAndSmalllist", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* ssn = icl_create_kernel(dev, "kernel/splitSmallNodes.cl", "splitSmallNodes", KERNEL_BUILD_MACRO, ICL_SOURCE);

#if timing == 1
	icl_timer* timer_ssn = icl_init_timer(ICL_MILLI);
	icl_timer* timer_sasl = icl_init_timer(ICL_MILLI);
	icl_timer* timer_rsn = icl_init_timer(ICL_MILLI);
#endif

	size_t localSize8 = 1;
	size_t globalSize8 = 1;
	UINT setMaxLevel = 0;
	icl_run_kernel(sasl, 1, &globalSize8, &localSize8, NULL, NULL, 2,
					(size_t)0, (void *)sizes,
					sizeof(UINT), &level);
	// get number of small nodes
	icl_read_buffer(sizes, CL_TRUE, sizeof(UINT), &activeN, NULL, NULL);

	while(activeN != 0)
	{
		icl_start_timer(timer);
		// compute SVH and determine the split plane
		size_t localSize9 = 256;
		size_t globalSize9 = ((activeN + 255) / 256) * 256;
#if timing == 1
		icl_start_timer(timer_ssn);
#endif
		icl_run_kernel(ssn, 1, &globalSize9, &localSize9, NULL, NULL, 5,
						(size_t)0, (void *)nodelist,
						(size_t)0, (void *)smalllist,
						(size_t)0, (void *)nextlist,
						(size_t)0, (void *)particlesD,
						(size_t)0, (void *)sizes);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_ssn);
#endif

		size_t localSizeA = 1;
		size_t globalSizeA = 1;
#if timing == 1
		icl_start_timer(timer_sasl);
#endif
		icl_run_kernel(sasl, 1, &globalSizeA, &localSizeA, NULL, NULL, 2,
						(size_t)0, (void *)sizes,
						sizeof(UINT), &setMaxLevel);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_sasl);
#endif

		swap(&nextlist, &smalllist);

		// read size of next activelist set in kernel
#if timing == 1
		icl_start_timer(timer_rsn);
#endif
		icl_read_buffer(sizes, CL_TRUE, sizeof(UINT), &activeN, NULL, NULL);
//printf("small size %d\n", activeN);
		icl_finish(dev);
#if timing == 1
		icl_stop_timer(timer_rsn);
#endif
		time = icl_stop_timer(timer);
	}

	icl_release_buffer(smalllist);
	icl_release_buffer(nextlist);

	icl_release_kernel(sasl);
	icl_release_kernel(ssn);

#if timing == 1
	printf("ssn %f\nsasl %f\nrsn %f\n\n", timer_ssn->current_time, timer_sasl->current_time, timer_rsn->current_time);
	icl_release_timer(timer_ssn);
	icl_release_timer(timer_sasl);
	icl_release_timer(timer_rsn);
#endif

	UINT s[5];
	icl_read_buffer(sizes, CL_TRUE, sizeof(UINT) * 5, &s, NULL, NULL);
	icl_release_buffer(sizes);

	icl_kernel* upPass = icl_create_kernel(dev, "kernel/upPass.cl", "upPass", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* downPass = icl_create_kernel(dev, "kernel/kdDownPass.cl", "downPass", KERNEL_BUILD_MACRO, ICL_SOURCE);

#if timing == 1
	icl_timer* timer_upPass = icl_init_timer(ICL_MILLI);
	icl_timer* timer_downPass = icl_init_timer(ICL_MILLI);
	icl_timer* timer_rt = icl_init_timer(ICL_MILLI);
#endif

	UINT treeHeight = s[4];
	printf("Tree height: %d\n", treeHeight);

	size_t localSizeB = 256;
	size_t globalSizeB = ((nNodes + 255) / 256) * 256;
	icl_start_timer(timer);
#if timing == 1
		icl_start_timer(timer_upPass);
#endif

	for(int l = (int)treeHeight; l >= 0; --l)
	{
		icl_run_kernel(upPass, 1, &globalSizeB, &localSizeB, NULL, NULL, 4,
						(size_t)0, (void *)nodelist,
						(size_t)0, (void *)particlesD,
						sizeof(int), &l,
						sizeof(UINT), &nNodes);
	}
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_upPass);
		icl_start_timer(timer_downPass);
#endif
	for(UINT l = 0; l <= treeHeight; ++l)
	{
		icl_run_kernel(downPass, 1, &globalSizeB, &localSizeB, NULL, NULL, 4,
						(size_t)0, (void *)nodelist,
						(size_t)0, (void *)treeD,
						sizeof(UINT), &l,
						sizeof(UINT), &nNodes);
	}
	icl_finish(dev);
#if timing == 1
	icl_stop_timer(timer_downPass);
#endif
	time = icl_stop_timer(timer);

	icl_release_kernel(upPass);
	icl_release_kernel(downPass);

#if timing == 1
	icl_start_timer(timer_rt);
#endif

#if timing == 1
	icl_finish(dev);
	icl_stop_timer(timer_rt);

	printf("upPass %f\ndownPass %f\nread Tree %f\n\n", timer_upPass->current_time, timer_downPass->current_time, timer_rt->current_time);
	icl_release_timer(timer_upPass);
	icl_release_timer(timer_downPass);
	icl_release_timer(timer_rt);
#endif

	//	struct Node* kdTree = (struct Node*)malloc(sizeof(struct Node) * nNodes);
//	icl_read_buffer(treeD, CL_TRUE, sizeof(struct Node) * nNodes, kdTree, NULL, NULL);
//	printf("%d", tree->nodelist[0].left_child);

	printf("\nTime: %f\n", time);
	icl_release_timer(timer);

	return treeHeight;
}

Пример #11

0

Показать файл

Файл: gpukdtree.c Проект: klois/gpukdtree

/*
 * Loads the particle snapshot named by argv[1], copies the
 * header.npartTotal[1] particles that follow the first header.npartTotal[0]
 * snapshot entries into a flat host array, uploads them to OpenCL device 0,
 * calls run() and finally reads the particles back from the device.
 *
 * Returns 0 on success and -1 when the snapshot argument is missing, the
 * snapshot cannot be loaded, a host allocation fails or no OpenCL device is
 * available.
 */
int main (int argc, char **argv)
{
	struct BBox box;
	struct Particle *particles;
	particle_data *P;
	io_header header;
	int k = 0;

	// the snapshot path is mandatory; argv[1] must not be read without it
	if(argc < 2)
	{
		printf("error: missing snapshot file argument\n");
		return -1;
	}

	int tot = snapshotLoader(argv[1], &header, &P);

	if(tot <= 0)
	{
		printf("error while loading snapshot file\n");
		return -1;
	}

	initBBox2(&box);

	particles = (struct Particle*)malloc(header.npartTotal[1] * sizeof(struct Particle));
	UINT* particleIds = (UINT*)malloc(header.npartTotal[1] * sizeof(UINT));
	if(particles == NULL || particleIds == NULL)
	{
		printf("error: out of memory while allocating particles\n");
		free(particles);
		free(particleIds);
		free(P);
		return -1;
	}

	// F is the sub-sampling stride: only every F-th snapshot particle is kept
#define F 1
	for(int j = header.npartTotal[0]; j < header.npartTotal[0]+header.npartTotal[1]; j += F)
	{
		particles[k].pos.x = P[j].Pos[0];
		particles[k].pos.y = P[j].Pos[1];
		particles[k].pos.z = P[j].Pos[2];
		particles[k].vel.x = P[j].Vel[0];
		particles[k].vel.y = P[j].Vel[1];
		particles[k].vel.z = P[j].Vel[2];
		particles[k].mass = P[j].Mass;
		particles[k].id = P[j].Id;
		// ids are kept separately as well; they are handed to the result check below
		particleIds[k] = P[j].Id;

		// NOTE: host-side accumulation of the bounding box into `box` is disabled
		++k;
	}

	// the raw snapshot is no longer needed once the particles are copied
	free(P);

	// from here on npartTotal[1] is the number of particles actually kept
	header.npartTotal[1] /= F;
	struct Tree tree;
	tree.nodelist = (struct Node*)malloc(2*header.npartTotal[1] * sizeof(struct Node));
	struct Particle* ref = (struct Particle*)malloc(sizeof(struct Particle) * header.npartTotal[1]);
	if(tree.nodelist == NULL || ref == NULL)
	{
		printf("error: out of memory while allocating tree\n");
		free(tree.nodelist);
		free(ref);
		free(particles);
		free(particleIds);
		return -1;
	}

	// init ocl
	icl_init_devices(DEVICE);

	if (icl_get_num_devices() != 0)
	{
		icl_device* dev = icl_get_device(0);

		icl_print_device_short_info(dev);

		icl_buffer* particlesD = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct Particle) * header.npartTotal[1]);

		// copy particles to ocl device
		icl_write_buffer(particlesD, CL_TRUE, sizeof(struct Particle) * header.npartTotal[1], &particles[0], NULL, NULL);

		run(1, 0.00001, 0.0025, particles, particlesD, header.npartTotal[1], dev, &tree, ref);

		// TODO particles have been resorted during tree construction; re-uploading
		// them in original order was only used for comparison and is disabled.

		// read particles from device
		icl_read_buffer(particlesD, CL_TRUE, sizeof(struct Particle) * header.npartTotal[1], &particles[0], NULL, NULL);

		icl_release_buffer(particlesD);
		icl_release_devices();

		printf("\nSUCCESS\n");
	} else {
		printf("ERROR! Cannot find requested device\n");
		// release every host buffer before bailing out
		free(tree.nodelist);
		free(particles);
		free(particleIds);
		free(ref);
		return -1;
	}

#if timing == 1
	check_force_internal("result.txt", ref, particles, particleIds, header.npartTotal[1]);
#endif

	free(tree.nodelist);
	free(particles);
	free(particleIds);
	free(ref);

	return 0;
}

Пример #12

0

Показать файл

Файл: ocl_jacsolver.c Проект: 8l/insieme

/**************************************************************************
 Function: ocl_jacobi

  This routine contains the main iteration loop for the Jacobi iteration
  using an OpenCL kernel. The host-side reduction of the per-workgroup
  deltas of iteration k runs while the kernel for iteration k+1 executes,
  so max_diff always refers to the previous iteration.

 params:
    a                       two arrays (OLD/NEW) to compute the solution into
    max_iter                maximum number of iterations
    size                    size of the array in each dimension
    tolerance               iteration stops once the maximum difference is
                            no longer greater than this value
    d                       discretisation size in each dimension
    local_workblock_size    size of the local workblock for the OpenCL kernel
    device_type             OpenCL device type handed to icl_init_devices
    full_copy               accepted for interface compatibility; not used
                            by this routine
 **************************************************************************/
static void ocl_jacobi(value_type *a[2],
                        unsigned int max_iter,
                        size_t size[DIMENSIONS],
                        value_type tolerance,
                        value_type d[DIMENSIONS],
                        size_t local_workblock_size[DIMENSIONS],
                        cl_device_type device_type,
                        unsigned int full_copy) {

    size_t array_size;
    unsigned int i, j, iter = 0;
    size_t delta_buffer_size, delta_size[DIMENSIONS];
    size_t tile_delta_size, tile_cache_size;
    value_type max_diff;
    /* No timing is performed in this routine; the value is zero-initialised so
     * that the final report prints a defined number instead of reading an
     * indeterminate variable. */
    value_type timer = 0;
    icl_device* device_id;
    icl_kernel* kernel;
    icl_buffer *a_buf[2], *delta_buf;
    value_type *delta;

    (void)full_copy;

    /* convenience for y stride in array */
    cl_uint ystride = size[Y]+2*GHOST_CELL_WIDTH;

    /* init devices */
    icl_init_devices(device_type);

    /* find OpenCL device */
    device_id  = icl_get_device(0);

    /* build the kernel */
    kernel = icl_create_kernel(device_id, "jacsolver_kernel.cl", "ocl_jacobi_local_copy", "", ICL_SOURCE);

    /* element counts of the two kernel-local scratch arrays, used for the kernel params below */
    tile_delta_size = local_workblock_size[X] * local_workblock_size[Y];
    tile_cache_size = (local_workblock_size[X]+2*GHOST_CELL_WIDTH) * (local_workblock_size[Y]+2*GHOST_CELL_WIDTH);

    /* NOTE: the device is not queried for sufficient local memory or a valid
     * workgroup size; the requested workblock is used as given. */
    printf("Estimating solution using OpenCL Jacobi iteration with %d x %d workblock.\n", (int)local_workblock_size[X], (int)local_workblock_size[Y]);
    fflush(stdout);

    /* init arrays by setting the initial value and the boundary conditions */
    set_initial_solution(a[OLD], size, INITIAL_GUESS);
    set_initial_solution(a[NEW], size, INITIAL_GUESS);
    set_boundary_conditions(a[OLD], size, d);
    set_boundary_conditions(a[NEW], size, d);

    /* print the initial solution guess */
    print_array("Init ", a[NEW], size, d);

    /* allocate memory for differences: one value per workgroup */
    delta_size[X] = size[X] / local_workblock_size[X];
    delta_size[Y] = size[Y] / local_workblock_size[Y];
    delta_buffer_size = delta_size[X] * delta_size[Y];
    delta = (value_type *)malloc(sizeof(value_type) * delta_buffer_size);
    if (delta == NULL) {
        fprintf(stderr, "ocl_jacobi: out of memory allocating delta buffer\n");
        icl_release_kernel(kernel);
        return;
    }

    /* initialize deltas so that first execution of kernel with overlapping
     * reduction on the host will work correctly and not prematurely exit
     */
    for (i=0; i<delta_size[X]; ++i) {
        for (j=0; j<delta_size[Y]; ++j) {
            delta[i * delta_size[Y] + j] = 1.0;
        }
    }

    /* create buffers for OpenCL device using host memory */
    array_size = (size[X]+2*GHOST_CELL_WIDTH) * ystride;
    a_buf[OLD] = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * array_size);
    a_buf[NEW] = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * array_size);
    delta_buf = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * delta_buffer_size);

    /* copy over buffers to device */
    icl_write_buffer(a_buf[OLD], CL_TRUE, sizeof(value_type) * array_size, a[OLD], NULL, NULL);
    icl_write_buffer(a_buf[NEW], CL_TRUE, sizeof(value_type) * array_size, a[NEW], NULL, NULL);

    /*  iterate until maximum difference is less than the given tolerance
        or number of iterations is too high */
    do {
        /* swap array pointers for next iteration */
        SWAP_PTR(a[OLD], a[NEW]);
        SWAP_BUF(a_buf[OLD], a_buf[NEW]);
        /* the two NULL-valued arguments are sized scratch arrays for the kernel */
        icl_run_kernel(kernel, DIMENSIONS, size, local_workblock_size, NULL, NULL, 6,
                    (size_t)0,(void *) a_buf[OLD],
                    (size_t)0, (void *) a_buf[NEW],
                    sizeof(value_type) * tile_delta_size, NULL,
                    sizeof(value_type) * tile_cache_size, NULL,
                    (size_t)0, (void *) delta_buf,
                    sizeof(cl_uint), (void *) &ystride);

        /* while the kernel is running, calculate the reduction for the previous iteration */
        max_diff = ocl_jacobi_reduce(delta, delta_size);

        /* enqueue a synchronous copy of the delta. This will not occur until the kernel
         * has finished. The deltas for each workgroup is a much smaller array to process
         */
        icl_read_buffer(delta_buf, CL_TRUE, sizeof(value_type) * delta_buffer_size, delta, NULL, NULL);

        /* output status for user, overwrite the same line */
        if ((0 == iter % 100)) {
            printf("Iteration=%5d, max difference=%0.7f, target=%0.7f\r",
                        iter, max_diff, tolerance);
            fflush(stdout);
        }

        /* increment the iteration counter */
        iter++;
    } while (max_diff > tolerance && max_iter >= iter); /* do loop */

    /* read back the final result */
    icl_read_buffer(a_buf[NEW], CL_TRUE, sizeof(value_type) * array_size, a[NEW], NULL, NULL);

    /* output final iteration count and maximum difference value */
    printf("Iteration=%5d, max difference=%0.7f, execution time=%.3f seconds\n", iter-1, max_diff, timer);
    fflush(stdout);

    /* finish usage of OpenCL device */
    icl_release_buffers(3, a_buf[OLD], a_buf[NEW], delta_buf);
    icl_release_kernel(kernel);
    free(delta);
}

Пример #13

0

Показать файл

Файл: knn.c Проект: 8l/insieme

int main(int argc, const char* argv[]) {
        icl_args* args = icl_init_args();
        icl_parse_args(argc, argv, args);
        icl_print_args(args);

	chdir(PATH);

//	int dim = 128;
	int size = args->size;
	int nRef = 100000;

	float* ref = (float*)malloc(sizeof(float) * nRef /* dim*/);
	float* query = (float*)malloc(sizeof(float) * size /* dim*/);
	float* dists = (float *)malloc(sizeof(float) * size);
	int* neighbors = (int*)malloc(sizeof(int) * size);
	srand(42);
	
	for(int i=0; i < nRef/*dim*/; ++i) {
		ref[i] = rand();
	}
	for(int i=0; i < size/**dim*/; ++i) {
		query[i] = rand();
	}

	icl_init_devices(ICL_ALL);
	
	icl_start_energy_measurement();

	if (icl_get_num_devices() != 0) {
		icl_device* dev = icl_get_device(0);

		icl_print_device_short_info(dev);
		icl_kernel* kernel = icl_create_kernel(dev, "knn.cl", "knn", "", ICL_SOURCE);
		
		size_t szLocalWorkSize = args->local_size;
		float multiplier = size/(float)szLocalWorkSize;
		if(multiplier > (int)multiplier)
			multiplier += 1;
		size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize;

		for (int i = 0; i < args->loop_iteration; ++i) {
			icl_buffer* buf_ref = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * nRef /* dim*/);
			icl_buffer* buf_query = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * size /* dim*/);
			icl_buffer* buf_dists = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(float) * size);
			icl_buffer* buf_neighbors = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(int) * size);

			icl_write_buffer(buf_ref, CL_TRUE, sizeof(float) * nRef /*dim*/, &ref[0], NULL, NULL);
			icl_write_buffer(buf_query, CL_TRUE, sizeof(float) * size /*dim*/, &query[0], NULL, NULL);
		
			icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 6,
												(size_t)0, (void *)buf_ref,
												(size_t)0, (void *)buf_query,
												(size_t)0, (void *)buf_dists,
												(size_t)0, (void *)buf_neighbors,
												sizeof(cl_int), (void *)&nRef,
												sizeof(cl_int), (void *)&size);
	//											sizeof(cl_int), (void *)&dim);
		
			icl_read_buffer(buf_dists, CL_TRUE, sizeof(float) * size, &dists[0], NULL, NULL);
			icl_read_buffer(buf_neighbors, CL_TRUE, sizeof(int) * size, &neighbors[0], NULL, NULL);
			icl_release_buffers(4, buf_ref, buf_query, buf_dists, buf_neighbors);
		}
		
		icl_release_kernel(kernel);
	}
	
	icl_stop_energy_measurement();
	
	if (args->check_result) {
		printf("======================\n= KNN program working\n");
		unsigned int check = 1;
		unsigned int sum = 0;
		for(int i = 0; i < size; ++i) {
			if(dists[i] < 0)
				check = 0;
			if(neighbors[i] < 0 || neighbors[i] >= nRef)
				check = 0;
		}
		
		printf("======================\n");
		printf("Result check: %s\n", check ? "OK" : "FAIL");
	} else {
		printf("Result check: OK\n");
	}

	icl_release_args(args);	
	icl_release_devices();
	free(ref);
	free(query);
	free(dists);
	free(neighbors);
}

Пример #14

0

Показать файл

Файл: reduction_chunking.c Проект: 8l/insieme

/*
 * Chunked-reduction benchmark driver: fills args->size cl_float16 chunks with
 * random values, runs the "reduce" kernel args->loop_iteration times on the
 * selected device and, when args->check_result is set, compares the minimum
 * over all device outputs with the minimum computed on the host.
 *
 * Returns 0 on normal completion and 1 when a host allocation fails.
 */
int main(int argc, const char* argv[]) 
{
	chdir(PATH);

	icl_args* args = icl_init_args();
	icl_parse_args(argc, argv, args);
	icl_print_args(args);

	int size = args->size;

	// this is the size of chunking - so far as big as the local size	
	int chunkSize = 16;

	cl_float16* input = (cl_float16*)malloc(sizeof(cl_float16) * size);
	float* output = (float*)malloc(sizeof(float) * size);
	if (input == NULL || output == NULL) {
		printf("ERROR! Out of host memory\n");
		free(input);
		free(output);
		icl_release_args(args);
		return 1;
	}

	// the cl_float16 array is filled as a flat array of size*chunkSize floats
	fillrandom_float((float*)input,size, chunkSize, 0.001f ,100000.f);

	icl_init_devices(args->device_type);

	icl_start_energy_measurement();

	if (icl_get_num_devices() != 0) {
		icl_device* dev = icl_get_device(args->device_id);

		icl_print_device_short_info(dev);
		icl_kernel* kernel = icl_create_kernel(dev, "reduction_chunking.cl", "reduce", "", ICL_SOURCE);

		// Round the global size up to a multiple of the local size using integer
		// arithmetic; a float quotient loses precision for sizes above 2^24 and
		// could yield fewer work items than elements.
		size_t szLocalWorkSize = args->local_size;
		size_t szGlobalWorkSize = (((size_t)size + szLocalWorkSize - 1) / szLocalWorkSize) * szLocalWorkSize;

		for (int i = 0; i < args->loop_iteration; ++i) {
			icl_buffer* buf_input = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_float16) * size);
			icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(float) * size);

			icl_write_buffer(buf_input, CL_FALSE, sizeof(cl_float16) * size, &input[0], NULL, NULL);

			icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 4,
				(size_t)0, (void *)buf_input,
				(size_t)0, (void *)buf_output,
				sizeof(cl_int), (void *)&chunkSize,
				sizeof(cl_int), (void *)&size
			);

			icl_read_buffer(buf_output, CL_TRUE, sizeof(float) * size, &output[0], NULL, NULL);
			icl_release_buffers(2, buf_input, buf_output);
		}

		icl_release_kernel(kernel);
	}

	icl_stop_energy_measurement();

	if (args->check_result) {
		printf("======================\n= Reduction test\n");
		float host_min = 100000.f;
		float* testInput = (float*)input;
		// element count is computed in size_t so size*chunkSize cannot overflow int
		size_t totalElements = (size_t)size * (size_t)chunkSize;
		for(size_t i = 0; i < totalElements; ++i) 
			if(testInput[i] < host_min)	host_min = testInput[i];
		printf("Host minimum is %f\n", host_min);

		float device_min = 100000.f;
		for(int i = 0; i < size; ++i) 
			if(output[i] < device_min)	device_min = output[i];			
		printf("Device minimum is %f\n", device_min);		

		printf("Result check: %s\n", (device_min == host_min) ? "OK" : "FAIL");
	} else {
		printf("Result check: OK\n");
	}

	// args was allocated by icl_init_args and must be released like in the sibling drivers
	icl_release_args(args);
	icl_release_devices();
	free(input);
	free(output);

	#ifdef _MSC_VER
	icl_prompt();
	#endif

	return 0;
}

Пример #15

0

Показать файл

Файл: lin_reg.c Проект: 8l/insieme

/*
 * Linear-regression benchmark driver: builds two input vectors and random
 * alpha/beta coefficient vectors of args->size elements, runs the "lin_reg"
 * kernel args->loop_iteration times on the selected device and, when
 * args->check_result is set, recomputes the squared error on the host and
 * compares it with the device output.
 *
 * Returns 0 on normal completion and 1 when a host allocation fails.
 */
int main(int argc, const char* argv[]) {
	icl_args* args = icl_init_args();
	icl_parse_args(argc, argv, args);
	icl_print_args(args);

	chdir(PATH);

	int size = args->size;

	float* input1 = (float*) malloc(sizeof(float) * size);
	float* input2 = (float*) malloc(sizeof(float) * size);
	float* alpha  = (float*) malloc(sizeof(float) * size);
	float* beta   = (float*) malloc(sizeof(float) * size);
	float* output = (float*) malloc(sizeof(float) * size);
	if (input1 == NULL || input2 == NULL || alpha == NULL || beta == NULL || output == NULL) {
		printf("ERROR! Out of host memory\n");
		free(input1);
		free(input2);
		free(alpha);
		free(beta);
		free(output);
		icl_release_args(args);
		return 1;
	}

	// input2: sorted random samples; input1: evenly spaced values starting at -1 with step 2/size
	fill_random_float(input2, size, 1, -1.0f, 1.0f);
	qsort(input2, size, sizeof(float), float_compare);
	float step = 2.0f / size;
	for(int i=0; i < size; i++) 
		input1[i] = -1.0f + i * step;

	fill_random_float(alpha, size, 1, -1.0f, 1.0f);
	fill_random_float(beta, size, 1, -1.0f, 1.0f);

	icl_init_devices(args->device_type);

	icl_start_energy_measurement();

	if (icl_get_num_devices() != 0) {
		icl_device* dev = icl_get_device(args->device_id);

		icl_print_device_short_info(dev);
		icl_kernel* kernel = icl_create_kernel(dev, "lin_reg.cl", "lin_reg", "", ICL_SOURCE);

		// Round the global size up to a multiple of the local size using integer
		// arithmetic; a float quotient loses precision for sizes above 2^24 and
		// could yield fewer work items than elements.
		size_t szLocalWorkSize = args->local_size;
		size_t szGlobalWorkSize = (((size_t)size + szLocalWorkSize - 1) / szLocalWorkSize) * szLocalWorkSize;

		for (int i = 0; i < args->loop_iteration; ++i) {
			icl_buffer* buf_input1 = icl_create_buffer(dev, CL_MEM_READ_ONLY,  sizeof(float) * size);
			icl_buffer* buf_input2 = icl_create_buffer(dev, CL_MEM_READ_ONLY,  sizeof(float) * size);
			icl_buffer* buf_alpha  = icl_create_buffer(dev, CL_MEM_READ_ONLY,  sizeof(float) * size);
			icl_buffer* buf_beta   = icl_create_buffer(dev, CL_MEM_READ_ONLY,  sizeof(float) * size);
			icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(float) * size);

			icl_write_buffer(buf_input1, CL_TRUE, sizeof(float) * size, &input1[0], NULL, NULL);
			icl_write_buffer(buf_input2, CL_TRUE, sizeof(float) * size, &input2[0], NULL, NULL);
			icl_write_buffer(buf_alpha, CL_TRUE, sizeof(float) * size, &alpha[0], NULL, NULL);
			icl_write_buffer(buf_beta, CL_TRUE, sizeof(float) * size, &beta[0], NULL, NULL);

			icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 6,
												(size_t)0, (void *)buf_input1,
												(size_t)0, (void *)buf_input2,
												(size_t)0, (void *)buf_alpha,
												(size_t)0, (void *)buf_beta,
												(size_t)0, (void *)buf_output,
												sizeof(cl_int), (void *)&size);
			icl_read_buffer(buf_output, CL_TRUE, sizeof(float) * size, &output[0], NULL, NULL);
			icl_release_buffers(5, buf_input1, buf_input2, buf_alpha, buf_beta, buf_output);
		}

		icl_release_kernel(kernel);
	}

	icl_stop_energy_measurement();

	if (args->check_result) {
		printf("======================\n= Linear Regression Done\n");
		float* output2 = (float *)malloc(sizeof(float) * size);
		if (output2 == NULL) {
			printf("ERROR! Out of host memory\n");
			icl_release_args(args);
			icl_release_devices();
			free(input1);
			free(input2);
			free(alpha);
			free(beta);
			free(output);
			return 1;
		}
		// host reference: for each (alpha, beta) pair, sum the squared residuals over all samples
		for(int gid = 0; gid < size; ++gid) {
			float a = alpha[gid];
			float b = beta[gid];
			float error = 0;
			for(int i=0; i<size; i++) {
				float e = (a * input1[i] + b) - input2[i];
				error += e * e;
			}
			output2[gid] = error;
		}

		bool check = compare_float(output, output2, size, 0.000001);
		printf("======================\n");
		printf("Result check: %s\n", check ? "OK" : "FAIL");
		free(output2);
	} else {
		printf("Result check: OK\n");
	}

	icl_release_args(args);
	icl_release_devices();
	free(input1);
	free(input2);
	free(alpha);
	free(beta);
	free(output);

	return 0;
}

Пример #16

0

Показать файл

Файл: simple.c Проект: 8l/insieme

/*
 * Minimal benchmark driver: fills an int array with 0..size-1, runs the
 * "simple" kernel args->loop_iteration times on device 0 and, when
 * args->check_result is set, verifies that the output equals the input.
 *
 * Returns 0 on normal completion and 1 when a host allocation fails.
 */
int main(int argc, const char* argv[]) {
	icl_args* args = icl_init_args();
	icl_parse_args(argc, argv, args);
	icl_print_args(args);

	chdir(PATH);

	int size = args->size;

	int* input = (int*)malloc(sizeof(int) * size);
	int* output = (int *)malloc(sizeof(int) * size);
	if (input == NULL || output == NULL) {
		printf("ERROR! Out of host memory\n");
		free(input);
		free(output);
		icl_release_args(args);
		return 1;
	}

	for(int i=0; i < size; ++i) {
		input[i] = i;
	}

	icl_init_devices(ICL_ALL);

	icl_start_energy_measurement();

	if (icl_get_num_devices() != 0) {
		icl_device* dev = icl_get_device(0);

		icl_print_device_short_info(dev);
		icl_kernel* kernel = icl_create_kernel(dev, "simple.cl", "simple", "", ICL_SOURCE);

		// Round the global size up to a multiple of the local size using integer
		// arithmetic; a float quotient loses precision for sizes above 2^24 and
		// could yield fewer work items than elements.
		size_t szLocalWorkSize = args->local_size;
		size_t szGlobalWorkSize = (((size_t)size + szLocalWorkSize - 1) / szLocalWorkSize) * szLocalWorkSize;

		for (int i = 0; i < args->loop_iteration; ++i) {
			icl_buffer* buf_input = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size);
			icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(int) * size);

			icl_write_buffer(buf_input, CL_TRUE, sizeof(int) * size, &input[0], NULL, NULL);

			icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 3,
												(size_t)0, (void *)buf_input,
												(size_t)0, (void *)buf_output,
												sizeof(cl_int), (void *)&size);

			icl_read_buffer(buf_output, CL_TRUE, sizeof(int) * size, &output[0], NULL, NULL);
			icl_release_buffers(2, buf_input, buf_output);
		}

		icl_release_kernel(kernel);
	}

	icl_stop_energy_measurement();

	if (args->check_result) {
		printf("======================\n= Simple program working\n");
		unsigned int check = 1;
		// signed index: matches the type of size and the %d conversions below
		for(int i = 0; i < size; ++i) {
			if(output[i] != input[i]) {
				check = 0;
				printf("= fail at %d, expected %d / actual %d", i, i, output[i]);
				break;
			}
		}
		printf("======================\n");
		printf("Result check: %s\n", check ? "OK" : "FAIL");
	} else {
		printf("Result check: OK\n");
	}

	icl_release_args(args);	
	icl_release_devices();
	free(input);
	free(output);

	return 0;
}

Пример #17

0

Показать файл

Файл: vec_mul.c Проект: 8l/insieme

/*
 * Vector-multiply example with event profiling: uploads two int vectors with
 * non-blocking writes, runs the "vec_mul" kernel once both writes have
 * completed, reads the result back after the kernel event and prints the
 * profiled duration of every step before checking the output.
 *
 * Returns 0 on normal completion and 1 when a host allocation fails.
 */
int main(int argc, char* argv[]) {
	int size = 1000;

	int* input1 = (int*)malloc(sizeof(int) * size);
	int* input2 = (int*) malloc(sizeof(int) * size);
	int* output = (int *)malloc(sizeof(int) * size);
	if (input1 == NULL || input2 == NULL || output == NULL) {
		printf("ERROR! Out of host memory\n");
		free(input1);
		free(input2);
		free(output);
		return 1;
	}

	for(int i=0; i < size; ++i) {
		input1[i] = i;
		input2[i] = 1;
	}

#ifndef INSIEME
	icl_timer* time1 = icl_init_timer(ICL_SEC);
	icl_start_timer(time1);
#endif
	icl_init_devices(ICL_CPU);
#ifndef INSIEME
	printf("TIME for initialization: %f\n", icl_stop_timer(time1));
#endif

	if (icl_get_num_devices() != 0) {
		icl_device* dev = icl_get_device(0);

		icl_print_device_short_info(dev);
		icl_kernel* kernel = icl_create_kernel(dev, "vec_mul.cl", "vec_mul", "", ICL_SOURCE);

		icl_buffer* buf_input1 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size);
		icl_buffer* buf_input2 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size);
		icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(int) * size);

		icl_event* wb1 = icl_create_event();
		icl_event* wb2 = icl_create_event();
		icl_event* rb = icl_create_event();

		// non-blocking uploads; completion is signalled through wb1 / wb2
		icl_write_buffer(buf_input1, CL_FALSE, sizeof(int) * size, &input1[0], NULL, wb1);
		icl_write_buffer(buf_input2, CL_FALSE, sizeof(int) * size, &input2[0], NULL, wb2);

		// Round the global size up to a multiple of the local size using integer
		// arithmetic instead of a float quotient, which loses precision for
		// large element counts.
		size_t szLocalWorkSize = 256;
		size_t szGlobalWorkSize = (((size_t)size + szLocalWorkSize - 1) / szLocalWorkSize) * szLocalWorkSize;

		icl_event* rk = icl_create_event();
		icl_event* wb_all = icl_create_event_list(2, wb1, wb2);	
		// the kernel waits on both uploads (wb_all) and signals rk when done
		icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, wb_all, rk, 4,
											(size_t)0, (void *)buf_input1,
											(size_t)0, (void *)buf_input2,
											(size_t)0, (void *)buf_output,
											sizeof(cl_int), (void *)&size);

		// blocking read that waits on the kernel event rk
		icl_read_buffer(buf_output, CL_TRUE, sizeof(int) * size, &output[0], rk, rb);

		printf("Time wb1 %f\n", icl_profile_event(wb1, ICL_STARTED, ICL_FINISHED, ICL_SEC));		
		printf("Time wb2 %f\n", icl_profile_event(wb2, ICL_STARTED, ICL_FINISHED, ICL_SEC));
		printf("Time rk %f\n", icl_profile_event(rk, ICL_STARTED, ICL_FINISHED, ICL_SEC));
		printf("Time rb %f\n", icl_profile_event(rb, ICL_STARTED, ICL_FINISHED, ICL_SEC));

		icl_release_events(5, wb1, wb2, wb_all, rk, rb);
		icl_release_buffers(3, buf_input1, buf_input2, buf_output);
		icl_release_kernel(kernel);
	}
#ifndef INSIEME
	icl_restart_timer(time1);
#endif
	icl_release_devices();
#ifndef INSIEME
	printf("TIME for releasing the devices: %f\n", icl_stop_timer(time1));
	icl_release_timer(time1);
#endif

	// CHECK for output
	printf("======================\n= Vector Mul Done\n");
	unsigned int check = 1;
	for(int i = 0; i < size; ++i) {
		// NOTE(review): the expected value i*size is taken from the original check;
		// confirm against vec_mul.cl that this is what the kernel computes.
		int expected = i*size;
		if(output[i] != expected) {
			check = 0;
			// report the value the check actually compared against
			printf("= fail at %d, expected %d / actual %d", i, expected, output[i]);
			break;
		}
	}
	printf("= result check: %s\n======================\n", check ? "OK" : "FAIL");
	free(input1);
	free(input2);
	free(output);

	return 0;
}