void __entry stencil_thread( void* p ) { my_args_t* pargs = (my_args_t*)p; int i,j; int NI = pargs->ni; int NJ = pargs->nj; int di = pargs->di; int dj = pargs->dj; int niter = pargs->niter; float* A = pargs->A; float* B = pargs->B; float w0 = pargs->w0; float w1 = pargs->w1; float w2 = pargs->w2; float w3 = pargs->w3; float w4 = pargs->w4; int myrank_2d, mycoords[2]; int dims[2] = {di, dj}; int periods[2] = {1, 1}; // Periodic communication but ignoring edge copy where irrelvant MPI_Status status; MPI_Init(0,MPI_BUF_SIZE); MPI_Comm comm = MPI_COMM_THREAD; MPI_Comm comm_2d; MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d); MPI_Comm_rank(comm_2d, &myrank_2d); MPI_Cart_coords(comm_2d, myrank_2d, 2, mycoords); int x = mycoords[0]; int y = mycoords[1]; // ranks of neighbors int north, south, west, east; MPI_Cart_shift(comm_2d, 0, 1, &west, &east); MPI_Cart_shift(comm_2d, 1, 1, &north, &south); // local stencil sizes with padding int ni = (NI-2) / di + 2; int nj = (NJ-2) / dj + 2; // Load the initial values void* memfree = coprthr_tls_sbrk(0); float* a = (float*)coprthr_tls_sbrk(ni*nj*sizeof(float)); float* b = (float*)coprthr_tls_sbrk(ni*nj*sizeof(float)); float* nsbuf = (float*)coprthr_tls_sbrk(ni*sizeof(float)); float* webuf = (float*)coprthr_tls_sbrk((nj-2)*sizeof(float)); long long* srcadr; long long* dstadr; long long* nsend = (long long*)(nsbuf + ni); // Copy initial conditions (2D DMA would be better) for (j=0; j<nj; j++) e_dma_copy(a+j*ni, A + (y*(ni-2)+j)*NI+x*(nj-2), ni*sizeof(float)); // Initial conditions // if(y==0) for (i=0; i<ni-2; i++) a[i] = -2.0f; // if(y==dj) for (i=0; i<ni-2; i++) a[(nj-1)*ni+i] = 1.0f; // if(x==di) for (j=0; j<nj-2; j++) a[(j+2)*ni-1] = -1.0f; // if(x==0) for (j=0; j<nj-2; j++) a[(j+1)*ni] = 2.0f; // Copy "a" into "b" (only need fixed borders would be better) for (i=0; i<ni*nj; i++) b[i] = a[i]; while (niter--) { /* for (j=1; j<nj-1; j++) { for (i=1; i<ni-1; i++) { b[j*ni+i] = w0*a[j*ni+i-1] + w1*a[j*ni+i] + w2*a[j*ni+i+1] + w3*a[j*ni+i-ni] + w4*a[j*ni+i+ni]; } }*/ for (j=0; j<nj-2; j+=4) { float a14 = a[(j+1)*ni+0]; float a15 = a[(j+1)*ni+1]; float a24 = a[(j+2)*ni+0]; float a25 = a[(j+2)*ni+1]; float a34 = a[(j+3)*ni+0]; float a35 = a[(j+3)*ni+1]; float a44 = a[(j+4)*ni+0]; float a45 = a[(j+4)*ni+1]; for (i=0; i<ni-2; i+=4) { float a01 = a[(j+0)*ni+i+1]; float a02 = a[(j+0)*ni+i+2]; float a03 = a[(j+0)*ni+i+3]; float a04 = a[(j+0)*ni+i+4]; float a10 = a14; float a11 = a15; float a12 = a[(j+1)*ni+i+2]; float a13 = a[(j+1)*ni+i+3]; a14 = a[(j+1)*ni+i+4]; a15 = a[(j+1)*ni+i+5]; float a20 = a24; float a21 = a25; float a22 = a[(j+2)*ni+i+2]; float a23 = a[(j+2)*ni+i+3]; a24 = a[(j+2)*ni+i+4]; a25 = a[(j+2)*ni+i+5]; float a30 = a34; float a31 = a35; float a32 = a[(j+3)*ni+i+2]; float a33 = a[(j+3)*ni+i+3]; a34 = a[(j+3)*ni+i+4]; a35 = a[(j+3)*ni+i+5]; float a40 = a44; float a41 = a45; float a42 = a[(j+4)*ni+i+2]; float a43 = a[(j+4)*ni+i+3]; a44 = a[(j+4)*ni+i+4]; a45 = a[(j+4)*ni+i+5]; float a51 = a[(j+5)*ni+i+1]; float a52 = a[(j+5)*ni+i+2]; float a53 = a[(j+5)*ni+i+3]; float a54 = a[(j+5)*ni+i+4]; b[(j+1)*ni+i+1] = fma(w4,a21,fma(w3,a01,fma(w2,a12,fma(w1,a11,w0*a10)))); b[(j+1)*ni+i+2] = fma(w4,a22,fma(w3,a02,fma(w2,a13,fma(w1,a12,w0*a11)))); b[(j+1)*ni+i+3] = fma(w4,a23,fma(w3,a03,fma(w2,a14,fma(w1,a13,w0*a12)))); b[(j+1)*ni+i+4] = fma(w4,a24,fma(w3,a04,fma(w2,a15,fma(w1,a14,w0*a13)))); b[(j+2)*ni+i+1] = fma(w4,a31,fma(w3,a11,fma(w2,a22,fma(w1,a21,w0*a20)))); b[(j+2)*ni+i+2] = fma(w4,a32,fma(w3,a12,fma(w2,a23,fma(w1,a22,w0*a21)))); b[(j+2)*ni+i+3] = fma(w4,a33,fma(w3,a13,fma(w2,a24,fma(w1,a23,w0*a22)))); b[(j+2)*ni+i+4] = fma(w4,a34,fma(w3,a14,fma(w2,a25,fma(w1,a24,w0*a23)))); b[(j+3)*ni+i+1] = fma(w4,a41,fma(w3,a21,fma(w2,a32,fma(w1,a31,w0*a30)))); b[(j+3)*ni+i+2] = fma(w4,a42,fma(w3,a22,fma(w2,a33,fma(w1,a32,w0*a31)))); b[(j+3)*ni+i+3] = fma(w4,a43,fma(w3,a23,fma(w2,a34,fma(w1,a33,w0*a32)))); b[(j+3)*ni+i+4] = fma(w4,a44,fma(w3,a24,fma(w2,a35,fma(w1,a34,w0*a33)))); b[(j+4)*ni+i+1] = fma(w4,a51,fma(w3,a31,fma(w2,a42,fma(w1,a41,w0*a40)))); b[(j+4)*ni+i+2] = fma(w4,a52,fma(w3,a32,fma(w2,a43,fma(w1,a42,w0*a41)))); b[(j+4)*ni+i+3] = fma(w4,a53,fma(w3,a33,fma(w2,a44,fma(w1,a43,w0*a42)))); b[(j+4)*ni+i+4] = fma(w4,a54,fma(w3,a34,fma(w2,a45,fma(w1,a44,w0*a43)))); } } // north/south dstadr = (long long*)nsbuf; srcadr = (long long*)(b+ni); while (dstadr != nsend) *dstadr++ = *srcadr++; // second row MPI_Sendrecv_replace(nsbuf, ni, MPI_FLOAT, north, 1, south, 1, comm, &status); if (y!=dj-1) { dstadr = (long long*)(b+(nj-1)*ni); srcadr = (long long*)nsbuf; while (srcadr != nsend) *dstadr++ = *srcadr++; // last row } dstadr = (long long*)nsbuf; srcadr = (long long*)(b+(nj-2)*ni); while (dstadr != nsend) *dstadr++ = *srcadr++; // second to last row MPI_Sendrecv_replace(nsbuf, ni, MPI_FLOAT, south, 1, north, 1, comm, &status); if (y) { dstadr = (long long*)b; srcadr = (long long*)nsbuf; while (srcadr != nsend) *dstadr++ = *srcadr++; // first row } // west/east for (j=0; j<nj-2; j++) webuf[j] = b[(j+1)*ni+1]; // second column MPI_Sendrecv_replace(webuf, nj-2, MPI_FLOAT, west, 1, east, 1, comm, &status); if (x!=di-1) for (j=0; j<nj-2; j++) b[(j+2)*ni-1] = webuf[j]; // last column for (j=0; j<nj-2; j++) webuf[j] = b[(j+2)*ni-2]; // second to last column MPI_Sendrecv_replace(webuf, nj-2, MPI_FLOAT, east, 1, west, 1, comm, &status); if (x) for (j=0; j<nj-2; j++) b[(j+1)*ni] = webuf[j]; // first column float* tmp = b; b = a; a = tmp; } // Copy internal results for (j=1; j<nj-1; j++) e_dma_copy(B + (y*(ni-2)+j)*NI+x*(nj-2)+1, a+j*ni+1, (ni-2)*sizeof(float)); coprthr_tls_brk(memfree); MPI_Finalize(); }
__kernel void dla_thread( void* p ) { my_args_t* pargs = (my_args_t*)p; int baseColor; int i, n, ix, iy, ixnew, iynew; int found; int R = 10; int R2 = 5; int seq; int row1[3], row2[3], row3[3]; int rank; int x_half = pargs->fbinfo.xres_virtual / 2; int y_half = pargs->fbinfo.yres_virtual / 2; float angle, mySin, myCos, inv_rand_max = 1.0f / (float) 32767; MPI_Status status; MPI_Init(0,MPI_BUF_SIZE); MPI_Comm comm = MPI_COMM_THREAD; MPI_Comm_rank(comm, &rank); baseColor = 0x00ffffff; void* memfree = coprthr_tls_sbrk(0); fast_srand(getpid()); seq = 1; if(rank == 0) { e_dma_copy((char *) pargs->fbinfo.smem_start + (y_half * pargs->fbinfo.line_length) + ((x_half + pargs->offset) * BPP), (char *) &baseColor, 1 * BPP); } for(n = 0; n < pargs->n; n++) { angle = ((2.0f * ((float) fastrand()) * inv_rand_max) - 1.0f) * 3.14159265f; if(angle < 0.0f) { mySin = 1.275323954f * angle + 0.405284735f * angle * angle; if(mySin < 0) mySin = 0.225f * (mySin * -mySin - mySin) + mySin; else mySin = 0.225f * (mySin * mySin - mySin) + mySin; } else { mySin = 1.275323954f * angle - 0.405284735f * angle * angle; if(mySin < 0) mySin = 0.225f * (mySin * -mySin - mySin) + mySin; else mySin = 0.225f * (mySin * mySin - mySin) + mySin; } angle += 1.57079632f; if(angle > 3.14159265f) angle -= 6.28318531f; if(angle < 0.0f) { myCos = 1.275323954f * angle + 0.405284735f * angle * angle; if(myCos < 0) myCos = 0.225f * (myCos * -myCos - myCos) + myCos; else myCos = 0.225f * (myCos * myCos - myCos) + myCos; } else { myCos = 1.275323954f * angle - 0.405284735f * angle * angle; if(myCos < 0) myCos = 0.225f * (myCos * -myCos - myCos) + myCos; else myCos = 0.225f * (myCos * myCos - myCos) + myCos; } ix = ((x_half + pargs->offset) + ((R2 - 2) * myCos)); iy = (y_half + ((R2 - 2) * mySin)); while(1) { ixnew = ix + (fastrand() % 3) - 1; if(ixnew > (x_half + pargs->offset) - R2 && ixnew < (x_half + pargs->offset) + R2) { ix = ixnew; } else { continue; } iynew = iy + (fastrand() % 3) - 1; if(iynew > y_half - R2 && iynew < y_half + R2) { iy = iynew; } else { continue; } found = 0; e_dma_copy(row1, (char *) pargs->fbinfo.smem_start + ((iy - 1) * pargs->fbinfo.line_length) + ((ix - 1) * BPP), 3 * BPP); e_dma_copy(row2, (char *) pargs->fbinfo.smem_start + ((iy) * pargs->fbinfo.line_length) + ((ix - 1) * BPP), 3 * BPP); e_dma_copy(row3, (char *) pargs->fbinfo.smem_start + ((iy + 1) * pargs->fbinfo.line_length) + ((ix - 1) * BPP), 3 * BPP); if(row1[0] != pargs->fbinfo.emptyPixVal || row2[0] != pargs->fbinfo.emptyPixVal || row3[0] != pargs->fbinfo.emptyPixVal) { found = 1; break; } if(row1[1] != pargs->fbinfo.emptyPixVal || row2[1] != pargs->fbinfo.emptyPixVal || row3[1] != pargs->fbinfo.emptyPixVal) { found = 1; break; } if(row1[2] != pargs->fbinfo.emptyPixVal || row2[2] != pargs->fbinfo.emptyPixVal || row3[2] != pargs->fbinfo.emptyPixVal) { found = 1; break; } } e_dma_copy((char *) pargs->fbinfo.smem_start + (iy * pargs->fbinfo.line_length) + (ix * BPP), (char *) &baseColor, 1 * BPP); if((ix - 3 <= ((x_half + pargs->offset) - R2) || (ix + 3 >= (x_half + pargs->offset) + R2) || (iy - 3 <= y_half - R2) || (iy + 3 >= y_half + R2))) { R += 4; R2 += 2; if(seq == 1) { baseColor -= (pargs->color == 'R' ? 0x00020000 : 0x00030000); } else if(seq == 2) { baseColor -= (pargs->color == 'B' ? 0x00000002 : 0x00000003); } else { baseColor -= (pargs->color == 'G' ? 0x00000200 : 0x00000300); } seq++; if(seq > 3) seq = 1; } } coprthr_tls_brk(memfree); MPI_Finalize(); return 0; }
__kernel void nbody_thread( void* p ) { my_args_t* pargs = (my_args_t*)p; int n = pargs->n; int cnt = pargs->cnt; unsigned int s_x, s_y, s_z, s_m; unsigned int page = 0; float dt = pargs->dt; float es = pargs->es; Particle *particles = pargs->p; ParticleV *state = pargs->v; int rank, size, npart, i; int left, right; MPI_Status status; MPI_Init(0,MPI_BUF_SIZE); MPI_Comm comm = MPI_COMM_THREAD; MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &size); MPI_Cart_shift(comm, 0, 1, &left, &right); npart = n / size; void* memfree = coprthr_tls_sbrk(0); Particle* my_particles = (Particle*)coprthr_tls_sbrk(npart*sizeof(Particle)); ParticleV* my_state = (ParticleV*)coprthr_tls_sbrk(npart*sizeof(ParticleV)); Particle* sendbuf = (Particle*)coprthr_tls_sbrk(npart*sizeof(Particle)); e_dma_copy(my_particles, particles + npart*rank, npart*sizeof(Particle)); e_dma_copy(my_state, state + npart*rank, npart*sizeof(ParticleV)); unsigned int rgba_black = 0x00000000; unsigned int rgba_white = 0x00ffffff; while (cnt--) { for (i=0; i<npart; i++) sendbuf[i] = my_particles[i]; for (i=0; i<size; i++) { if (i) MPI_Sendrecv_replace(sendbuf, sizeof(Particle)/sizeof(float)*npart, MPI_FLOAT, left, 1, right, 1, comm, &status); ComputeAccel(my_particles, sendbuf, my_state, npart, es); } e_dma_copy(particles + npart*rank, my_particles, npart*sizeof(Particle)); ComputeNewPos(my_particles, my_state, npart, dt); for(i = 0; i < npart; i++){ s_x = (int) particles[i + npart*rank].x; s_y = (int) particles[i + npart*rank].y; if(s_x >= 0 && s_x < pargs->fbinfo.xres_virtual && s_y >= 0 && s_y < pargs->fbinfo.yres_virtual){ e_dma_copy((char *) pargs->fbinfo.smem_start + (s_y * pargs->fbinfo.line_length) + (s_x * BPP), (char *) &rgba_black, 1 * BPP); } s_x = (int) my_particles[i].x; s_y = (int) my_particles[i].y; if(cnt > 1 && s_x >= 0 && s_x < pargs->fbinfo.xres_virtual && s_y >= 0 && s_y < pargs->fbinfo.yres_virtual){ e_dma_copy((char *) pargs->fbinfo.smem_start + (s_y * pargs->fbinfo.line_length) + (s_x * BPP), (char *) &rgba_white, 1 * BPP); } } } coprthr_tls_brk(memfree); MPI_Finalize(); }
__kernel void my_thread( void* p) { my_args_t* pargs = (my_args_t*)p; int N = pargs->N, s = pargs->s, d = pargs->d; float *ga = pargs->ga, *gb = pargs->gb, *gc = pargs->gc; int n = N/d; int myrank_2d, mycoords[2]; int dims[2] = {d, d}; int periods[2] = {1, 1}; MPI_Status status; MPI_Init(0,MPI_BUF_SIZE); MPI_Comm comm = MPI_COMM_THREAD; MPI_Comm comm_2d; MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d); MPI_Comm_rank(comm_2d, &myrank_2d); MPI_Cart_coords(comm_2d, myrank_2d, 2, mycoords); // Compute ranks of the up and left shifts int uprank, downrank, leftrank, rightrank; MPI_Cart_shift(comm_2d, 0, 1, &leftrank, &rightrank); MPI_Cart_shift(comm_2d, 1, 1, &uprank, &downrank); int x = mycoords[0]; int y = mycoords[1]; // this removes initial skew shift by reading in directly int skew = (x+y) % d; void* memfree = coprthr_tls_sbrk(0); float* a = (float*)coprthr_tls_sbrk(n*n*sizeof(float)); float* b = (float*)coprthr_tls_sbrk(n*n*sizeof(float)); float* c = (float*)coprthr_tls_sbrk(n*n*sizeof(float)); e_dma_desc_t dma_c_read, dma_c_write, dma_a_read, dma_b_read; #define DWORD_WRITE(desc,w,h,W,src,dst) \ e_dma_set_desc(E_DMA_0, (E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000, \ 0x0008, 0x0008, \ w/2, h, \ 8, 4*(W-w+2), \ (void*)src, (void*)dst, &desc) #define DWORD_READ(desc,w,h,W,src,dst) \ e_dma_set_desc(E_DMA_0, (E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000, \ 0x0008, 0x0008, \ w/2, h, \ 4*(W-w+2), 8, \ (void*)src, (void*)dst, &desc) int loop; for(loop=0;loop<LOOP1;loop++) { int i,j,k,l; for (i=0; i<s; i++) { for (j=0; j<s; j++) { float* rgc = gc + ((i*N + y*n)*s + j)*N + x*n; DWORD_WRITE(dma_c_write,n,n,s*N,c,rgc); DWORD_READ(dma_c_read,n,n,s*N,rgc,c); // read C e_dma_start(&dma_c_read, E_DMA_0); e_dma_wait(E_DMA_0); for (k=0; k<s; k++) { float* rga = ga + ((i*N + y*n)*s + k)*N + skew*n; float* rgb = gb + ((k*N + skew*n)*s + j)*N + x*n; // read A and B DWORD_READ(dma_b_read,n,n,s*N,rgb,b); DWORD_READ(dma_a_read,n,n,s*N,rga,a); e_dma_start(&dma_b_read, E_DMA_0); e_dma_wait(E_DMA_0); e_dma_start(&dma_a_read, E_DMA_0); e_dma_wait(E_DMA_0); // transpose B int ji, ii; for (ji=0; ji<n-1; ji++) { for(ii=ji+1; ii<n; ii++) { int tmp = b[ji*n+ii]; b[ji*n+ii] = b[ii*n+ji]; b[ii*n+ji] = tmp; } } int loop; for (loop=0;loop<LOOP3;loop++) { // Get into the main computation loop for (l=1; l<d; l++) { int loop; for(loop=0;loop<LOOP2;loop++) MatrixMultiply(n, a, b, c); // Shift matrix a left by one and shift matrix b up by one MPI_Sendrecv_replace(a, n*n, MPI_FLOAT, leftrank, 1, rightrank, 1, comm_2d, &status); MPI_Sendrecv_replace(b, n*n, MPI_FLOAT, uprank, 1, downrank, 1, comm_2d, &status); } MatrixMultiply(n, a, b, c); } // end LOOP3 } // write C e_dma_start(&dma_c_write, E_DMA_1); e_dma_wait(E_DMA_1); } } } // end LOOP1 coprthr_tls_brk(memfree); MPI_Finalize(); }