int main(void) { unsigned k,i,j; unsigned *dst, *mst; unsigned tran; e_dma_desc_t dma_desc; tran = 128; dst = (unsigned *)0x2000; mst = (unsigned *)0x4000; e_reg_write(E_REG_DMA0CONFIG, 0); e_reg_write(E_REG_DMA1CONFIG, 0); // Initialize the buffer in receiver core for (i=0; i<tran*5; i++) { dst[i] = 0x00000000; } mst[0] = 0x00000000; // Initialize the auto dma register in receiver core e_reg_write(E_REG_DMA0AUTODMA0, 0x00000000); // Prepare for the descriptor slave dma e_dma_set_desc(E_DMA_0,(E_DMA_ENABLE|E_DMA_WORD), 0x0000, 0x0000, 0x0004, 0x0400, 0x0001, 0x0000, 0x0000, 0x0000,(void *)dst, &dma_desc); // Wait for the signal to start dma transfering while(*mst == 0x00000000){}; e_dma_start(&dma_desc, E_DMA_0); e_dma_wait(E_DMA_0); return 0; }
void e_dma_set_desc( e_dma_id_t chan, unsigned config, e_dma_desc_t *next_desc, unsigned strd_i_src, unsigned strd_i_dst, unsigned count_i, unsigned count_o, unsigned strd_o_src, unsigned strd_o_dst, void *addr_src, void *addr_dst, e_dma_desc_t *desc) { e_dma_wait(chan); desc->config = (((unsigned) next_desc) << 16) | config; desc->inner_stride = (strd_i_dst << 16) | strd_i_src; desc->count = (count_o << 16) | count_i; desc->outer_stride = (strd_o_dst << 16) | strd_o_src; desc->src_addr = addr_src; desc->dst_addr = addr_dst; return; }
int main(void) { unsigned k,i,j; e_dma_desc_t dma_desc[4]; unsigned *dst, *src, *dst1, *src1, *dst2, *src2; unsigned tran; unsigned tran1; unsigned index[3]; unsigned neighbour_core; unsigned *n_row, *n_col, *p; unsigned *mailbox, *mailbox1, *mailbox2, *mailbox3; n_row = (unsigned *)0x00006400; n_col = (unsigned *)0x00006404; tran = 128; p = 0x0000; // Get the core id of neighbour core e_neighbor_id(E_NEXT_CORE, E_ROW_WRAP, n_row, n_col); neighbour_core = (unsigned) e_get_global_address(*n_row, *n_col, p); // Define the mailbox mailbox = (unsigned *)0x6000; mailbox1 = (unsigned *)0x6100; mailbox2 = (unsigned *)0x6200; mailbox3 = (unsigned *)0x6300; // Initialize the buffer address for dma chain test src = (int *)0x2000; dst = (int *)(neighbour_core + (unsigned)0x2000); src1 = (int *)0x2300; dst1 = (int *)(neighbour_core + (unsigned)0x2500); src2 = (int *)0x2600; dst2 = (int *)(neighbour_core + (unsigned)0x2a00); // Test for word size // Initialize the source and destination buffer for (i=0; i<tran; i++) { src[i] = 0xaaaaaaaa; src1[i] = 0xbbbbbbbb; src2[i] = 0xcccccccc; } for (i=0; i<tran*6; i++) { dst[i] = 0x00000000; } // Prepare for the descriptor for 2d dma e_dma_set_desc(E_DMA_0,(E_DMA_ENABLE|E_DMA_MASTER|E_DMA_WORD), 0x0000, 0x0004, 0x0004, 0x0080, 0x0003, 0x0104 , 0x0304, (void *)src,(void *)dst, &dma_desc[0]); // Start transaction e_dma_start(&dma_desc[0], E_DMA_0); // Wait e_dma_wait(E_DMA_0); // Check the destination buffer value index[0] = checkbuffer(dst, (unsigned)0xaaaaaaaa, tran); index[1] = checkbuffer(dst1, (unsigned)0xbbbbbbbb, tran); index[2] = checkbuffer(dst2, (unsigned)0xcccccccc, tran); if((index[0]|index[1]|index[2]) == 0) { mailbox[0] = 0xffffffff; }else { mailbox[0] = 0x00000000; } // Test for doubleword size // Initialize the source and destination buffer for (i=0; i<tran; i++) { src[i] = 0xaaaaaaaa; src1[i] = 0xbbbbbbbb; src2[i] = 0xcccccccc; } for (i=0; i<tran*6; i++) { dst[i] = 0x00000000; } // Prepare for the descriptor for 2d dma e_dma_set_desc(E_DMA_0,(E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000, 0x0008, 0x0008, 0x0040, 0x0003, 0x0108 , 0x0308, (void *)src,(void *)dst, &dma_desc[0]); // Start transaction e_dma_start(&dma_desc[0], E_DMA_0); // Wait e_dma_wait(E_DMA_0); // Check the destination buffer value index[0] = checkbuffer(dst, (unsigned)0xaaaaaaaa, tran); index[1] = checkbuffer(dst1, (unsigned)0xbbbbbbbb, tran); index[2] = checkbuffer(dst2, (unsigned)0xcccccccc, tran); if((index[0]|index[1]|index[2]) == 0) { mailbox1[0] = 0xffffffff; }else { mailbox1[0] = 0x00000000; } // Test for half size // Initialize the source and destination buffer for (i=0; i<tran; i++) { src[i] = 0xaaaaaaaa; src1[i] = 0xbbbbbbbb; src2[i] = 0xcccccccc; } for (i=0; i<tran*6; i++) { dst[i] = 0x00000000; } // Prepare for the descriptor for 2d dma e_dma_set_desc(E_DMA_0,(E_DMA_ENABLE|E_DMA_MASTER|E_DMA_HWORD), 0x0000, 0x0002, 0x0002, 0x0100, 0x0003, 0x0102 , 0x0302, (void *)src,(void *)dst, &dma_desc[0]); // Start transaction e_dma_start(&dma_desc[0], E_DMA_0); // Wait e_dma_wait(E_DMA_0); // Check the destination buffer value index[0] = checkbuffer(dst, (unsigned)0xaaaaaaaa, tran); index[1] = checkbuffer(dst1, (unsigned)0xbbbbbbbb, tran); index[2] = checkbuffer(dst2, (unsigned)0xcccccccc, tran); if((index[0]|index[1]|index[2]) == 0) { mailbox2[0] = 0xffffffff; }else { mailbox2[0] = 0x00000000; } // Test for byte size // Initialize the source and destination buffer for (i=0; i<tran; i++) { src[i] = 0xaaaaaaaa; src1[i] = 0xbbbbbbbb; src2[i] = 0xcccccccc; } for (i=0; i<tran*6; i++) { dst[i] = 0x00000000; } // Prepare for the descriptor for 2d dma e_dma_set_desc(E_DMA_0,(E_DMA_ENABLE|E_DMA_MASTER|E_DMA_BYTE), 0x0000, 0x0001, 0x0001, 0x0200, 0x0003, 0x0101 , 0x0301, (void *)src,(void *)dst, &dma_desc[0]); // Start transaction e_dma_start(&dma_desc[0], E_DMA_0); // Wait e_dma_wait(E_DMA_0); // Check the destination buffer value index[0] = checkbuffer(dst, (unsigned)0xaaaaaaaa, tran); index[1] = checkbuffer(dst1, (unsigned)0xbbbbbbbb, tran); index[2] = checkbuffer(dst2, (unsigned)0xcccccccc, tran); if((index[0]|index[1]|index[2]) == 0) { mailbox3[0] = 0xffffffff; }else { mailbox3[0] = 0x00000000; } return 0; }
int main() { msg_t *m = (msg_t *) 0x4000; uint64_t d64; uint32_t d32; uint16_t d16; uint8_t d8; unsigned int dt; void *da, *dua, *dc; for (unsigned int row = 0; row < E_ROWS; row++) { for (unsigned int col = 0; col < E_COLS; col++) { if (row == 0 && col == 0) continue; unsigned int tcore = row * E_COLS + col; da = e_get_global_address(row, col, (void *) MEM_ALINEADA); dua = e_get_global_address(row, col, (void *) MEM_NO_ALINEADA); dc = e_get_global_address(row, col, (void *) MEM_A_CABALLO); e_dma_wait(E_DMA_1); e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); for (int i = 0; i < VECES; i++) e_dma_copy(&d64, da, sizeof(d64)); e_dma_wait(E_DMA_1); e_wait(E_CTIMER_1, DUMMY_WAIT); e_ctimer_stop(E_CTIMER_0); dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0); m->ticks[tcore].t64 = (dt - DUMMY_WAIT) / (double) VECES; e_dma_wait(E_DMA_1); e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); for (int i = 0; i < VECES; i++) e_dma_copy(&d32, da, sizeof(d32)); e_dma_wait(E_DMA_1); e_wait(E_CTIMER_1, DUMMY_WAIT); e_ctimer_stop(E_CTIMER_0); dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0); m->ticks[tcore].t32 = (dt - DUMMY_WAIT) / (double) VECES; e_dma_wait(E_DMA_1); e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); for (int i = 0; i < VECES; i++) e_dma_copy(&d16, da, sizeof(d16)); e_dma_wait(E_DMA_1); e_wait(E_CTIMER_1, DUMMY_WAIT); e_ctimer_stop(E_CTIMER_0); dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0); m->ticks[tcore].t16 = (dt - DUMMY_WAIT) / (double) VECES; e_dma_wait(E_DMA_1); e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); for (int i = 0; i < VECES; i++) e_dma_copy(&d8, da, sizeof(d8)); e_dma_wait(E_DMA_1); e_wait(E_CTIMER_1, DUMMY_WAIT); e_ctimer_stop(E_CTIMER_0); dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0); m->ticks[tcore].t8 = (dt - DUMMY_WAIT) / (double) VECES; e_dma_wait(E_DMA_1); e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); for (int i = 0; i < VECES; i++) e_dma_copy(&d64, dua, sizeof(d64)); e_dma_wait(E_DMA_1); e_wait(E_CTIMER_1, DUMMY_WAIT); e_ctimer_stop(E_CTIMER_0); dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0); m->ticks[tcore].ua64 = (dt - DUMMY_WAIT) / (double) VECES; e_dma_wait(E_DMA_1); e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); for (int i = 0; i < VECES; i++) e_dma_copy(&d32, dua, sizeof(d32)); e_dma_wait(E_DMA_1); e_wait(E_CTIMER_1, DUMMY_WAIT); e_ctimer_stop(E_CTIMER_0); dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0); m->ticks[tcore].ua32 = (dt - DUMMY_WAIT) / (double) VECES; e_dma_wait(E_DMA_1); e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); for (int i = 0; i < VECES; i++) e_dma_copy(&d16, dua, sizeof(d16)); e_dma_wait(E_DMA_1); e_wait(E_CTIMER_1, DUMMY_WAIT); e_ctimer_stop(E_CTIMER_0); dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0); m->ticks[tcore].ua16 = (dt - DUMMY_WAIT) / (double) VECES; e_dma_wait(E_DMA_1); e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); for (int i = 0; i < VECES; i++) e_dma_copy(&d8, dua, sizeof(d8)); e_dma_wait(E_DMA_1); e_wait(E_CTIMER_1, DUMMY_WAIT); e_ctimer_stop(E_CTIMER_0); dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0); m->ticks[tcore].ua8 = (dt - DUMMY_WAIT) / (double) VECES; e_dma_wait(E_DMA_1); e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); for (int i = 0; i < VECES; i++) e_dma_copy(&d64, dua, sizeof(d64)); e_dma_wait(E_DMA_1); e_wait(E_CTIMER_1, DUMMY_WAIT); e_ctimer_stop(E_CTIMER_0); dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0); m->ticks[tcore].c64 = (dt - DUMMY_WAIT) / (double) VECES; e_dma_wait(E_DMA_1); e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); for (int i = 0; i < VECES; i++) e_dma_copy(&d32, dc, sizeof(d32)); e_dma_wait(E_DMA_1); e_wait(E_CTIMER_1, DUMMY_WAIT); e_ctimer_stop(E_CTIMER_0); dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0); m->ticks[tcore].c32 = (dt - DUMMY_WAIT) / (double) VECES; e_dma_wait(E_DMA_1); e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); for (int i = 0; i < VECES; i++) e_dma_copy(&d16, dc, sizeof(d16)); e_dma_wait(E_DMA_1); e_wait(E_CTIMER_1, DUMMY_WAIT); e_ctimer_stop(E_CTIMER_0); dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0); m->ticks[tcore].c16 = (dt - DUMMY_WAIT) / (double) VECES; e_dma_wait(E_DMA_1); e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); for (int i = 0; i < VECES; i++) e_dma_copy(&d8, dc, sizeof(d8)); e_dma_wait(E_DMA_1); e_wait(E_CTIMER_1, DUMMY_WAIT); e_ctimer_stop(E_CTIMER_0); dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0); m->ticks[tcore].c8 = (dt - DUMMY_WAIT) / (double) VECES; } } m->finalizado = E_TRUE; return 0; }
__kernel void my_thread( void* p) { my_args_t* pargs = (my_args_t*)p; int N = pargs->N, s = pargs->s, d = pargs->d; float *ga = pargs->ga, *gb = pargs->gb, *gc = pargs->gc; int n = N/d; int myrank_2d, mycoords[2]; int dims[2] = {d, d}; int periods[2] = {1, 1}; MPI_Status status; MPI_Init(0,MPI_BUF_SIZE); MPI_Comm comm = MPI_COMM_THREAD; MPI_Comm comm_2d; MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d); MPI_Comm_rank(comm_2d, &myrank_2d); MPI_Cart_coords(comm_2d, myrank_2d, 2, mycoords); // Compute ranks of the up and left shifts int uprank, downrank, leftrank, rightrank; MPI_Cart_shift(comm_2d, 0, 1, &leftrank, &rightrank); MPI_Cart_shift(comm_2d, 1, 1, &uprank, &downrank); int x = mycoords[0]; int y = mycoords[1]; // this removes initial skew shift by reading in directly int skew = (x+y) % d; void* memfree = coprthr_tls_sbrk(0); float* a = (float*)coprthr_tls_sbrk(n*n*sizeof(float)); float* b = (float*)coprthr_tls_sbrk(n*n*sizeof(float)); float* c = (float*)coprthr_tls_sbrk(n*n*sizeof(float)); e_dma_desc_t dma_c_read, dma_c_write, dma_a_read, dma_b_read; #define DWORD_WRITE(desc,w,h,W,src,dst) \ e_dma_set_desc(E_DMA_0, (E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000, \ 0x0008, 0x0008, \ w/2, h, \ 8, 4*(W-w+2), \ (void*)src, (void*)dst, &desc) #define DWORD_READ(desc,w,h,W,src,dst) \ e_dma_set_desc(E_DMA_0, (E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000, \ 0x0008, 0x0008, \ w/2, h, \ 4*(W-w+2), 8, \ (void*)src, (void*)dst, &desc) int loop; for(loop=0;loop<LOOP1;loop++) { int i,j,k,l; for (i=0; i<s; i++) { for (j=0; j<s; j++) { float* rgc = gc + ((i*N + y*n)*s + j)*N + x*n; DWORD_WRITE(dma_c_write,n,n,s*N,c,rgc); DWORD_READ(dma_c_read,n,n,s*N,rgc,c); // read C e_dma_start(&dma_c_read, E_DMA_0); e_dma_wait(E_DMA_0); for (k=0; k<s; k++) { float* rga = ga + ((i*N + y*n)*s + k)*N + skew*n; float* rgb = gb + ((k*N + skew*n)*s + j)*N + x*n; // read A and B DWORD_READ(dma_b_read,n,n,s*N,rgb,b); DWORD_READ(dma_a_read,n,n,s*N,rga,a); e_dma_start(&dma_b_read, E_DMA_0); e_dma_wait(E_DMA_0); e_dma_start(&dma_a_read, E_DMA_0); e_dma_wait(E_DMA_0); // transpose B int ji, ii; for (ji=0; ji<n-1; ji++) { for(ii=ji+1; ii<n; ii++) { int tmp = b[ji*n+ii]; b[ji*n+ii] = b[ii*n+ji]; b[ii*n+ji] = tmp; } } int loop; for (loop=0;loop<LOOP3;loop++) { // Get into the main computation loop for (l=1; l<d; l++) { int loop; for(loop=0;loop<LOOP2;loop++) MatrixMultiply(n, a, b, c); // Shift matrix a left by one and shift matrix b up by one MPI_Sendrecv_replace(a, n*n, MPI_FLOAT, leftrank, 1, rightrank, 1, comm_2d, &status); MPI_Sendrecv_replace(b, n*n, MPI_FLOAT, uprank, 1, downrank, 1, comm_2d, &status); } MatrixMultiply(n, a, b, c); } // end LOOP3 } // write C e_dma_start(&dma_c_write, E_DMA_1); e_dma_wait(E_DMA_1); } } } // end LOOP1 coprthr_tls_brk(memfree); MPI_Finalize(); }