__kernel void memset_uint4(__global int *mem, const int size, __private int val) { \n\
int tid = get_local_id(0); \n\
int bx = (get_group_id(1)) * (get_num_groups(0)) + get_group_id(0); \n\
int i = tid + (bx) * (get_local_size(0)); \n\
//debug \n\
//if (i == 0) { printf(\"memset size = %i value = %i buffer %i \\n\",size,val,mem[0]); } \n\
if (i < size ) { mem[i]=val; } \n\
}" };
Esempio n. 2
/// \fn _copyKernel
/// \brief Declares the OpenCL C source text of `copy_kernel`, a kernel that
///        copies floats from `src` to `dst` at identical indices.
///
/// The kernel text is passed through BOOST_COMPUTE_STRINGIZE_SOURCE and stored
/// in the local character array `source`.
///
/// \param context compute context; it is not referenced by the lines visible
///        in this excerpt.
/// \return a compute::program according to the signature.
///
/// NOTE(review): TILE_DIM and BLOCK_ROWS are not defined in this excerpt;
/// presumably they are supplied as macros when the program is built. Confirm.
/// NOTE(review): this excerpt contains no braces and stops before the macro
/// call is closed, so the build and return steps are not shown here.
compute::program _copyKernel(const compute::context& context)
    // Kernel source text, stored as a character array.
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void copy_kernel(__global const float *src, __global float *dst)
            // Column of this work-item: group id in dimension 0 scaled by
            // TILE_DIM, plus the local id in dimension 0.
            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
            // Starting row of this work-item, computed the same way from
            // dimension 1.
            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
            // Row length used for flat indexing: number of groups in
            // dimension 0 times TILE_DIM.
            uint width = get_num_groups(0) * TILE_DIM;
            // Copy the elements at rows y, y + BLOCK_ROWS, ... (offsets below
            // TILE_DIM) in column x from src to the same flat index in dst.
            for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS)
                dst[(y+i)*width +x] = src[(y+i)*width + x];
Esempio n. 3
void step_bodies(
		struct Body * bodies,
		struct Pair * pairs,
		unsigned int * map,
		float dt,
		unsigned int num_bodies, // in
		float * velocity_ratio, // in/out
		float * mass_center, // in
		float mass, // in
		unsigned int * number_escaped // out
	/* work group */
	int local_block = num_bodies / get_num_groups(0);

	unsigned int i_group0 = get_group_id(0) * local_block;
	unsigned int i_group1 = i_group0 + local_block;

	if(get_group_id(0) == (get_num_groups(0) - 1)) i_group1 = num_bodies;

	/* work item */
	int block = (i_group1 - i_group0) / get_local_size(0);

	unsigned int i_local0 = i_group0 + get_local_id(0) * block;
	unsigned int i_local1 = i_local0 + block;

	if(get_local_id(0) == (get_local_size(0) - 1)) i_local1 = i_group1;

	   printf("local_block = %i\n", local_block);
	   printf("block = %i\n", block);
	   printf("i_local0 = %i\n", i_local0);
	   printf("i_local1 = %i\n", i_local1);
	/* copy data for work group */
	//__local struct Pair local_pairs[NUM_PAIRS];
	//__local struct BodyMap local_bodymaps[NUM_BODIES / NUM_GROUPS];

	//event_t e0 = async_work_group_copy((__local char *)local_pairs, (char *)pairs, NUM_PAIRS * sizeof(struct Pair), 0);
	//wait_group_events(1, &e0);

	//event_t e1 = async_work_group_copy((__local char *)local_bodymaps, (char *)(bodymaps + i_group0), (i_group1 - i_group0) * sizeof(struct BodyMap), 0);
	//wait_group_events(1, &e1);

	/* */
	float f[3];

	//__local struct BodyMap * pbm = 0;
	//struct BodyMap * pbm = 0;

	Body * pb = 0;

	for(unsigned int b = i_local0; b < i_local1; b++)
		//pbm = local_bodymaps + b;
		//pbm = bodymaps + b;

		pb = bodies + b;

		if(pb->alive == 0)
			//puts("body dead");

		f[0] = 0;
		f[1] = 0;
		f[2] = 0;

		for(unsigned int i = 0; i < num_bodies; i++)
			if(b == i) continue;

			//__local struct Pair * pp = &local_pairs[pbm->pair[p]];
			Pair * pp = pairs + map[b * num_bodies + i];

			if(pp->_M_alive == 0) continue;

			if(pp->b0 == b)
				f[0] -= pp->u[0] * pp->f;
				f[1] -= pp->u[1] * pp->f;
				f[2] -= pp->u[2] * pp->f;
			else if(pp->b1 == b)
				f[0] += pp->u[0] * pp->f;
				f[1] += pp->u[1] * pp->f;
				f[2] += pp->u[2] * pp->f;

		float dv[3];

			dv[0] = dt * f[0] / pb->mass;
			dv[1] = dt * f[1] / pb->mass;
			dv[2] = dt * f[2] / pb->mass;
			dv[0] = dt * pb->f[0] / pb->mass;
			dv[1] = dt * pb->f[1] / pb->mass;
			dv[2] = dt * pb->f[2] / pb->mass;

				(!feq(pb->f[0], f[0])) ||
				(!feq(pb->f[1], f[1])) ||
				(!feq(pb->f[2], f[2]))


		// reset accumulating force
		pb->f[0] = 0;
		pb->f[1] = 0;
		pb->f[2] = 0;

		float e = 0.01;

		float rat[3];
		rat[0] = fabs(dv[0] / pb->v[0]);
		rat[1] = fabs(dv[1] / pb->v[1]);
		rat[2] = fabs(dv[2] / pb->v[2]);

		// atomic
		if(std::isfinite(rat[0])) if(rat[0] > velocity_ratio[0]) velocity_ratio[0] = rat[0];
		if(std::isfinite(rat[1])) if(rat[1] > velocity_ratio[1]) velocity_ratio[1] = rat[1];
		if(std::isfinite(rat[2])) if(rat[2] > velocity_ratio[2]) velocity_ratio[2] = rat[2];

					((std::isfinite(rat[0])) && (rat[0] > e)) ||
					((std::isfinite(rat[1])) && (rat[1] > e)) ||
					((std::isfinite(rat[2])) && (rat[2] > e))
				printf("% 12f % 12f % 12f\n",

		pb->v[0] += dv[0];
		pb->v[1] += dv[1];
		pb->v[2] += dv[2];

		pb->x[0] += dt * pb->v[0];
		pb->x[1] += dt * pb->v[1];
		pb->x[2] += dt * pb->v[2];

		// distance from mass center
		float r[3];
		r[0] = pb->x[0] - mass_center[0];
		r[1] = pb->x[1] - mass_center[1];
		r[2] = pb->x[2] - mass_center[2];

		float d = sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]);

		float escape_speed2 = 2.0 * 6.67e-11 * mass / d;

		float s2 = pb->v[0]*pb->v[0] + pb->v[1]*pb->v[1] + pb->v[2]*pb->v[2];

		// dot product of velocity and displacement vector
		float dot = pb->v[0] * r[0] + pb->v[1] * r[1] + pb->v[2] * r[2];

		if(s2 > (escape_speed2)) // speed exceeds escape speed
			if(dot > 0.0) // parallel componenet points away from mass_center
				// atomic