Пример #1
0
void __entry
stencil_thread( void* p )
{
	my_args_t* pargs = (my_args_t*)p;

	int i,j;
	int NI = pargs->ni;
	int NJ = pargs->nj;
	int di = pargs->di;
	int dj = pargs->dj;
	int niter = pargs->niter;
	float* A = pargs->A;
	float* B = pargs->B;
	float w0 = pargs->w0;
	float w1 = pargs->w1;
	float w2 = pargs->w2;
	float w3 = pargs->w3;
	float w4 = pargs->w4;

	int myrank_2d, mycoords[2];
	int dims[2] = {di, dj};
	int periods[2] = {1, 1}; // Periodic communication but ignoring edge copy where irrelvant

	MPI_Status status;
	MPI_Init(0,MPI_BUF_SIZE);

	MPI_Comm comm = MPI_COMM_THREAD;
	MPI_Comm comm_2d;
	MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d);
	MPI_Comm_rank(comm_2d, &myrank_2d);
	MPI_Cart_coords(comm_2d, myrank_2d, 2, mycoords);

	int x = mycoords[0];
	int y = mycoords[1];

	// ranks of neighbors
	int north, south, west, east;
	MPI_Cart_shift(comm_2d, 0, 1, &west, &east);
	MPI_Cart_shift(comm_2d, 1, 1, &north, &south);

	// local stencil sizes with padding
	int ni = (NI-2) / di + 2;
	int nj = (NJ-2) / dj + 2;

	// Load the initial values
	void* memfree = coprthr_tls_sbrk(0);
	float* a = (float*)coprthr_tls_sbrk(ni*nj*sizeof(float));
	float* b = (float*)coprthr_tls_sbrk(ni*nj*sizeof(float));
	float* nsbuf = (float*)coprthr_tls_sbrk(ni*sizeof(float));
	float* webuf = (float*)coprthr_tls_sbrk((nj-2)*sizeof(float));
	long long* srcadr;
	long long* dstadr;
	long long* nsend = (long long*)(nsbuf + ni);

	// Copy initial conditions (2D DMA would be better)
	for (j=0; j<nj; j++) e_dma_copy(a+j*ni, A + (y*(ni-2)+j)*NI+x*(nj-2), ni*sizeof(float));

	// Initial conditions
//	if(y==0) for (i=0; i<ni-2; i++) a[i] = -2.0f;
//	if(y==dj) for (i=0; i<ni-2; i++) a[(nj-1)*ni+i] = 1.0f;
//	if(x==di) for (j=0; j<nj-2; j++) a[(j+2)*ni-1] = -1.0f;
//	if(x==0) for (j=0; j<nj-2; j++) a[(j+1)*ni] = 2.0f;

	// Copy "a" into "b" (only need fixed borders would be better)
	for (i=0; i<ni*nj; i++) b[i] = a[i];

	while (niter--) {

/*		for (j=1; j<nj-1; j++) {
			for (i=1; i<ni-1; i++) {
				b[j*ni+i] = w0*a[j*ni+i-1] + w1*a[j*ni+i] + w2*a[j*ni+i+1] + w3*a[j*ni+i-ni] + w4*a[j*ni+i+ni];
			}
		}*/

		for (j=0; j<nj-2; j+=4)
		{
			float a14 = a[(j+1)*ni+0];
			float a15 = a[(j+1)*ni+1];
			float a24 = a[(j+2)*ni+0];
			float a25 = a[(j+2)*ni+1];
			float a34 = a[(j+3)*ni+0];
			float a35 = a[(j+3)*ni+1];
			float a44 = a[(j+4)*ni+0];
			float a45 = a[(j+4)*ni+1];
			for (i=0; i<ni-2; i+=4)
			{
				float a01 = a[(j+0)*ni+i+1];
				float a02 = a[(j+0)*ni+i+2];
				float a03 = a[(j+0)*ni+i+3];
				float a04 = a[(j+0)*ni+i+4];
				float a10 = a14;
				float a11 = a15;
				float a12 = a[(j+1)*ni+i+2];
				float a13 = a[(j+1)*ni+i+3];
				a14 = a[(j+1)*ni+i+4];
				a15 = a[(j+1)*ni+i+5];
				float a20 = a24;
				float a21 = a25;
				float a22 = a[(j+2)*ni+i+2];
				float a23 = a[(j+2)*ni+i+3];
				a24 = a[(j+2)*ni+i+4];
				a25 = a[(j+2)*ni+i+5];
				float a30 = a34;
				float a31 = a35;
				float a32 = a[(j+3)*ni+i+2];
				float a33 = a[(j+3)*ni+i+3];
				a34 = a[(j+3)*ni+i+4];
				a35 = a[(j+3)*ni+i+5];
				float a40 = a44;
				float a41 = a45;
				float a42 = a[(j+4)*ni+i+2];
				float a43 = a[(j+4)*ni+i+3];
				a44 = a[(j+4)*ni+i+4];
				a45 = a[(j+4)*ni+i+5];
				float a51 = a[(j+5)*ni+i+1];
				float a52 = a[(j+5)*ni+i+2];
				float a53 = a[(j+5)*ni+i+3];
				float a54 = a[(j+5)*ni+i+4];

				b[(j+1)*ni+i+1] = fma(w4,a21,fma(w3,a01,fma(w2,a12,fma(w1,a11,w0*a10))));
				b[(j+1)*ni+i+2] = fma(w4,a22,fma(w3,a02,fma(w2,a13,fma(w1,a12,w0*a11))));
				b[(j+1)*ni+i+3] = fma(w4,a23,fma(w3,a03,fma(w2,a14,fma(w1,a13,w0*a12))));
				b[(j+1)*ni+i+4] = fma(w4,a24,fma(w3,a04,fma(w2,a15,fma(w1,a14,w0*a13))));
				b[(j+2)*ni+i+1] = fma(w4,a31,fma(w3,a11,fma(w2,a22,fma(w1,a21,w0*a20))));
				b[(j+2)*ni+i+2] = fma(w4,a32,fma(w3,a12,fma(w2,a23,fma(w1,a22,w0*a21))));
				b[(j+2)*ni+i+3] = fma(w4,a33,fma(w3,a13,fma(w2,a24,fma(w1,a23,w0*a22))));
				b[(j+2)*ni+i+4] = fma(w4,a34,fma(w3,a14,fma(w2,a25,fma(w1,a24,w0*a23))));
				b[(j+3)*ni+i+1] = fma(w4,a41,fma(w3,a21,fma(w2,a32,fma(w1,a31,w0*a30))));
				b[(j+3)*ni+i+2] = fma(w4,a42,fma(w3,a22,fma(w2,a33,fma(w1,a32,w0*a31))));
				b[(j+3)*ni+i+3] = fma(w4,a43,fma(w3,a23,fma(w2,a34,fma(w1,a33,w0*a32))));
				b[(j+3)*ni+i+4] = fma(w4,a44,fma(w3,a24,fma(w2,a35,fma(w1,a34,w0*a33))));
				b[(j+4)*ni+i+1] = fma(w4,a51,fma(w3,a31,fma(w2,a42,fma(w1,a41,w0*a40))));
				b[(j+4)*ni+i+2] = fma(w4,a52,fma(w3,a32,fma(w2,a43,fma(w1,a42,w0*a41))));
				b[(j+4)*ni+i+3] = fma(w4,a53,fma(w3,a33,fma(w2,a44,fma(w1,a43,w0*a42))));
				b[(j+4)*ni+i+4] = fma(w4,a54,fma(w3,a34,fma(w2,a45,fma(w1,a44,w0*a43))));

			}
		}

		// north/south
		dstadr = (long long*)nsbuf;
		srcadr = (long long*)(b+ni);
		while (dstadr != nsend) *dstadr++ = *srcadr++; // second row
		MPI_Sendrecv_replace(nsbuf, ni, MPI_FLOAT, north, 1, south, 1, comm, &status);
		if (y!=dj-1) {
			dstadr = (long long*)(b+(nj-1)*ni);
			srcadr = (long long*)nsbuf;
			while (srcadr != nsend) *dstadr++ = *srcadr++; // last row
		}
		dstadr = (long long*)nsbuf;
		srcadr = (long long*)(b+(nj-2)*ni);
		while (dstadr != nsend) *dstadr++ = *srcadr++; // second to last row
		MPI_Sendrecv_replace(nsbuf, ni, MPI_FLOAT, south, 1, north, 1, comm, &status);
		if (y) {
			dstadr = (long long*)b;
			srcadr = (long long*)nsbuf;
			while (srcadr != nsend) *dstadr++ = *srcadr++; // first row
		}

		// west/east
		for (j=0; j<nj-2; j++) webuf[j] = b[(j+1)*ni+1]; // second column
		MPI_Sendrecv_replace(webuf, nj-2, MPI_FLOAT, west, 1, east, 1, comm, &status);
		if (x!=di-1) for (j=0; j<nj-2; j++) b[(j+2)*ni-1] = webuf[j]; // last column
		for (j=0; j<nj-2; j++) webuf[j] = b[(j+2)*ni-2]; // second to last column
		MPI_Sendrecv_replace(webuf, nj-2, MPI_FLOAT, east, 1, west, 1, comm, &status);
		if (x) for (j=0; j<nj-2; j++) b[(j+1)*ni] = webuf[j]; // first column

		float* tmp = b;
		b = a;
		a = tmp;
	}

	// Copy internal results
	for (j=1; j<nj-1; j++) e_dma_copy(B + (y*(ni-2)+j)*NI+x*(nj-2)+1, a+j*ni+1, (ni-2)*sizeof(float));

	coprthr_tls_brk(memfree);

	MPI_Finalize();
}
Пример #2
0
__kernel void
dla_thread( void* p )
{
    my_args_t* pargs = (my_args_t*)p;
    int baseColor;
    int i, n, ix, iy, ixnew, iynew;
    int found;
    int R = 10;
    int R2 = 5;
    int seq;
    int row1[3], row2[3], row3[3];
    int rank;
    int x_half = pargs->fbinfo.xres_virtual / 2;
    int y_half = pargs->fbinfo.yres_virtual / 2;
    float angle, mySin, myCos, inv_rand_max = 1.0f / (float) 32767;


    MPI_Status status;
    MPI_Init(0,MPI_BUF_SIZE);
    MPI_Comm comm = MPI_COMM_THREAD;
    MPI_Comm_rank(comm, &rank);

    baseColor = 0x00ffffff;

    void* memfree = coprthr_tls_sbrk(0);
    fast_srand(getpid());
    seq = 1;
    if(rank == 0) {
        e_dma_copy((char *) pargs->fbinfo.smem_start + (y_half * pargs->fbinfo.line_length) + ((x_half + pargs->offset) * BPP), (char *) &baseColor, 1 * BPP);
    }
    for(n = 0; n < pargs->n; n++) {
        angle = ((2.0f * ((float) fastrand()) * inv_rand_max) - 1.0f) * 3.14159265f;
        if(angle < 0.0f) {
            mySin = 1.275323954f * angle + 0.405284735f * angle * angle;
            if(mySin < 0)
                mySin = 0.225f * (mySin * -mySin - mySin) + mySin;
            else
                mySin = 0.225f * (mySin * mySin - mySin) + mySin;
        }
        else {
            mySin = 1.275323954f * angle - 0.405284735f * angle * angle;
            if(mySin < 0)
                mySin = 0.225f * (mySin * -mySin - mySin) + mySin;
            else
                mySin = 0.225f * (mySin * mySin - mySin) + mySin;
        }
        angle += 1.57079632f;
        if(angle > 3.14159265f)
            angle -= 6.28318531f;
        if(angle < 0.0f) {
            myCos = 1.275323954f * angle + 0.405284735f * angle * angle;
            if(myCos < 0)
                myCos = 0.225f * (myCos * -myCos - myCos) + myCos;
            else
                myCos = 0.225f * (myCos * myCos - myCos) + myCos;
        }
        else {
            myCos = 1.275323954f * angle - 0.405284735f * angle * angle;
            if(myCos < 0)
                myCos = 0.225f * (myCos * -myCos - myCos) + myCos;
            else
                myCos = 0.225f * (myCos * myCos - myCos) + myCos;
        }
        ix = ((x_half + pargs->offset) + ((R2 - 2) * myCos));
        iy = (y_half + ((R2 - 2) * mySin));

        while(1) {
            ixnew = ix + (fastrand() % 3) - 1;
            if(ixnew > (x_half + pargs->offset) - R2 && ixnew < (x_half + pargs->offset) + R2) {
                ix = ixnew;
            }
            else {
                continue;
            }
            iynew = iy + (fastrand() % 3) - 1;
            if(iynew > y_half - R2 && iynew < y_half + R2) {
                iy = iynew;
            }
            else {
                continue;
            }
            found = 0;
            e_dma_copy(row1, (char *) pargs->fbinfo.smem_start + ((iy - 1) * pargs->fbinfo.line_length) + ((ix - 1) * BPP), 3 * BPP);
            e_dma_copy(row2, (char *) pargs->fbinfo.smem_start + ((iy) * pargs->fbinfo.line_length) + ((ix - 1) * BPP), 3 * BPP);
            e_dma_copy(row3, (char *) pargs->fbinfo.smem_start + ((iy + 1) * pargs->fbinfo.line_length) + ((ix - 1) * BPP), 3 * BPP);
            if(row1[0] != pargs->fbinfo.emptyPixVal || row2[0] != pargs->fbinfo.emptyPixVal || row3[0] != pargs->fbinfo.emptyPixVal) {
                found = 1;
                break;
            }
            if(row1[1] != pargs->fbinfo.emptyPixVal || row2[1] != pargs->fbinfo.emptyPixVal || row3[1] != pargs->fbinfo.emptyPixVal) {
                found = 1;
                break;
            }
            if(row1[2] != pargs->fbinfo.emptyPixVal || row2[2] != pargs->fbinfo.emptyPixVal || row3[2] != pargs->fbinfo.emptyPixVal) {
                found = 1;
                break;
            }
        }
        e_dma_copy((char *) pargs->fbinfo.smem_start + (iy * pargs->fbinfo.line_length) + (ix * BPP), (char *) &baseColor, 1 * BPP);

        if((ix - 3 <= ((x_half + pargs->offset) - R2) || (ix + 3 >= (x_half + pargs->offset) + R2) || (iy - 3 <= y_half - R2) || (iy + 3 >= y_half + R2))) {
            R += 4;
            R2 += 2;
            if(seq == 1) {
                baseColor -= (pargs->color == 'R' ? 0x00020000 : 0x00030000);
            }
            else if(seq == 2) {
                baseColor -= (pargs->color == 'B' ? 0x00000002 : 0x00000003);
            }
            else {
                baseColor -= (pargs->color == 'G' ? 0x00000200 : 0x00000300);
            }
            seq++;
            if(seq > 3)
                seq = 1;
        }
    }

    coprthr_tls_brk(memfree);

    MPI_Finalize();
    return 0;
}
Пример #3
0
	__kernel void
nbody_thread( void* p )
{
	my_args_t* pargs = (my_args_t*)p;

	int n = pargs->n;
	int cnt = pargs->cnt;
	unsigned int s_x, s_y, s_z, s_m;
	unsigned int page = 0;
	float dt = pargs->dt;
	float es = pargs->es;
	Particle *particles = pargs->p;
	ParticleV *state = pargs->v;

	int rank, size, npart, i;
	int left, right;

	MPI_Status status;
	MPI_Init(0,MPI_BUF_SIZE);
	MPI_Comm comm = MPI_COMM_THREAD;
	MPI_Comm_rank(comm, &rank);
	MPI_Comm_size(comm, &size);
	MPI_Cart_shift(comm, 0, 1, &left, &right);

	npart = n / size;

	void* memfree = coprthr_tls_sbrk(0);
	Particle* my_particles = (Particle*)coprthr_tls_sbrk(npart*sizeof(Particle));
	ParticleV* my_state = (ParticleV*)coprthr_tls_sbrk(npart*sizeof(ParticleV));
	Particle* sendbuf = (Particle*)coprthr_tls_sbrk(npart*sizeof(Particle));

	e_dma_copy(my_particles, particles + npart*rank, npart*sizeof(Particle));
	e_dma_copy(my_state, state + npart*rank, npart*sizeof(ParticleV));
	unsigned int rgba_black = 0x00000000;
	unsigned int rgba_white = 0x00ffffff;

	while (cnt--) {

		for (i=0; i<npart; i++) sendbuf[i] = my_particles[i];

		for (i=0; i<size; i++) {
			if (i) MPI_Sendrecv_replace(sendbuf, sizeof(Particle)/sizeof(float)*npart, MPI_FLOAT, left, 1, right, 1, comm, &status);
			ComputeAccel(my_particles, sendbuf, my_state, npart, es);
		}
		e_dma_copy(particles + npart*rank, my_particles, npart*sizeof(Particle));
		ComputeNewPos(my_particles, my_state, npart, dt);

		for(i = 0; i < npart; i++){
			s_x = (int) particles[i + npart*rank].x;
			s_y = (int) particles[i + npart*rank].y;
			if(s_x >= 0 && s_x < pargs->fbinfo.xres_virtual && s_y >= 0 && s_y < pargs->fbinfo.yres_virtual){
				e_dma_copy((char *) pargs->fbinfo.smem_start + (s_y * pargs->fbinfo.line_length) + (s_x * BPP), (char *) &rgba_black, 1 * BPP);
			}
			s_x = (int) my_particles[i].x;
			s_y = (int) my_particles[i].y;
			if(cnt  > 1 && s_x >= 0 && s_x < pargs->fbinfo.xres_virtual && s_y >= 0 && s_y < pargs->fbinfo.yres_virtual){
				e_dma_copy((char *) pargs->fbinfo.smem_start + (s_y * pargs->fbinfo.line_length) + (s_x * BPP), (char *) &rgba_white, 1 * BPP);
			}
		}

	}

	coprthr_tls_brk(memfree);

	MPI_Finalize();
}
Пример #4
0
__kernel void
my_thread( void* p) {

	my_args_t* pargs = (my_args_t*)p;

	int N = pargs->N, s = pargs->s, d = pargs->d; 
	float *ga = pargs->ga, *gb = pargs->gb, *gc = pargs->gc;
	int n = N/d;

	int myrank_2d, mycoords[2];
	int dims[2] = {d, d};
	int periods[2] = {1, 1};

	MPI_Status status;
	MPI_Init(0,MPI_BUF_SIZE);

	MPI_Comm comm = MPI_COMM_THREAD;
	MPI_Comm comm_2d;
	MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d);
	MPI_Comm_rank(comm_2d, &myrank_2d);
	MPI_Cart_coords(comm_2d, myrank_2d, 2, mycoords);
	// Compute ranks of the up and left shifts
	int uprank, downrank, leftrank, rightrank;
	MPI_Cart_shift(comm_2d, 0, 1, &leftrank, &rightrank);
	MPI_Cart_shift(comm_2d, 1, 1, &uprank, &downrank);

	int x = mycoords[0];
	int y = mycoords[1];
	// this removes initial skew shift by reading in directly
	int skew = (x+y) % d;

	void* memfree = coprthr_tls_sbrk(0);
	float* a = (float*)coprthr_tls_sbrk(n*n*sizeof(float));
	float* b = (float*)coprthr_tls_sbrk(n*n*sizeof(float));
	float* c = (float*)coprthr_tls_sbrk(n*n*sizeof(float));

	e_dma_desc_t dma_c_read, dma_c_write, dma_a_read, dma_b_read;

#define DWORD_WRITE(desc,w,h,W,src,dst) \
	e_dma_set_desc(E_DMA_0, (E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000, \
	0x0008, 0x0008, \
	w/2, h, \
	8, 4*(W-w+2), \
	(void*)src, (void*)dst, &desc)
#define DWORD_READ(desc,w,h,W,src,dst) \
	e_dma_set_desc(E_DMA_0, (E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000, \
	0x0008, 0x0008, \
	w/2, h, \
	4*(W-w+2), 8, \
	(void*)src, (void*)dst, &desc)

int loop;
for(loop=0;loop<LOOP1;loop++) {

	int i,j,k,l;
	for (i=0; i<s; i++) {
		for (j=0; j<s; j++) {
			float* rgc = gc + ((i*N + y*n)*s + j)*N + x*n;
			DWORD_WRITE(dma_c_write,n,n,s*N,c,rgc);
			DWORD_READ(dma_c_read,n,n,s*N,rgc,c);
			// read C
			e_dma_start(&dma_c_read, E_DMA_0);
			e_dma_wait(E_DMA_0);
			for (k=0; k<s; k++) {
				float* rga = ga + ((i*N + y*n)*s + k)*N + skew*n;
				float* rgb = gb + ((k*N + skew*n)*s + j)*N + x*n;
				// read A and B
				DWORD_READ(dma_b_read,n,n,s*N,rgb,b);
				DWORD_READ(dma_a_read,n,n,s*N,rga,a);
				e_dma_start(&dma_b_read, E_DMA_0);
				e_dma_wait(E_DMA_0);
				e_dma_start(&dma_a_read, E_DMA_0);
				e_dma_wait(E_DMA_0);
				// transpose B
				int ji, ii;
				for (ji=0; ji<n-1; ji++) {
					for(ii=ji+1; ii<n; ii++) {
						int tmp = b[ji*n+ii];
						b[ji*n+ii] = b[ii*n+ji];
						b[ii*n+ji] = tmp;
					}
				}
				int loop;
				for (loop=0;loop<LOOP3;loop++) {
				// Get into the main computation loop
				for (l=1; l<d; l++) {
					int loop;
					for(loop=0;loop<LOOP2;loop++)
					MatrixMultiply(n, a, b, c);
					// Shift matrix a left by one and shift matrix b up by one
					MPI_Sendrecv_replace(a, n*n, MPI_FLOAT, leftrank, 1, rightrank, 1, comm_2d, &status);
					MPI_Sendrecv_replace(b, n*n, MPI_FLOAT, uprank, 1, downrank, 1, comm_2d, &status);
				}
				MatrixMultiply(n, a, b, c);
			} // end LOOP3
			}
			// write C
			e_dma_start(&dma_c_write, E_DMA_1);
			e_dma_wait(E_DMA_1);
		}
	}
} // end LOOP1

	coprthr_tls_brk(memfree);

	MPI_Finalize();

}