int main(void)
{
	unsigned k,i,j;
	unsigned *dst, *mst;
	unsigned tran;
	e_dma_desc_t dma_desc;
	tran = 128;

	dst = (unsigned *)0x2000;
	mst = (unsigned *)0x4000;

	e_reg_write(E_REG_DMA0CONFIG, 0);
	e_reg_write(E_REG_DMA1CONFIG, 0);

	// Initialize the buffer in receiver core
	for (i=0; i<tran*5; i++)
	{ 
		dst[i]  = 0x00000000;
	}
	
	mst[0] = 0x00000000;
	
	// Initialize the auto dma register in receiver core
	e_reg_write(E_REG_DMA0AUTODMA0, 0x00000000);

	// Prepare for the descriptor slave dma 
	
	e_dma_set_desc(E_DMA_0,(E_DMA_ENABLE|E_DMA_WORD), 0x0000,
	0x0000, 0x0004,
	0x0400, 0x0001,
	0x0000, 0x0000,
	0x0000,(void *)dst, &dma_desc);
	
	// Wait for the signal to start dma transfering
	
	while(*mst == 0x00000000){};
	
	e_dma_start(&dma_desc, E_DMA_0);
	
	e_dma_wait(E_DMA_0);
	
	return 0;
}
void e_dma_set_desc(
		e_dma_id_t chan,
		unsigned config,     e_dma_desc_t *next_desc,
		unsigned strd_i_src, unsigned strd_i_dst,
		unsigned count_i,    unsigned count_o,
		unsigned strd_o_src, unsigned strd_o_dst,
		void     *addr_src,  void *addr_dst,
		e_dma_desc_t *desc)
{
	e_dma_wait(chan);
	desc->config       = (((unsigned) next_desc) << 16) | config;
	desc->inner_stride = (strd_i_dst             << 16) | strd_i_src;
	desc->count        = (count_o                << 16) | count_i;
	desc->outer_stride = (strd_o_dst             << 16) | strd_o_src;
	desc->src_addr     = addr_src;
	desc->dst_addr     = addr_dst;

	return;
}
int main(void)
{
	unsigned k,i,j;
	e_dma_desc_t dma_desc[4];
	unsigned *dst, *src, *dst1, *src1, *dst2, *src2;
	unsigned tran;
	unsigned tran1;
	unsigned index[3];
	unsigned neighbour_core;
	unsigned *n_row, *n_col, *p;
	unsigned *mailbox, *mailbox1, *mailbox2, *mailbox3;
	n_row = (unsigned *)0x00006400;
	n_col = (unsigned *)0x00006404;
	tran = 128;
	p = 0x0000;
	
	// Get the core id of neighbour core
	e_neighbor_id(E_NEXT_CORE, E_ROW_WRAP, n_row, n_col);

	neighbour_core = (unsigned) e_get_global_address(*n_row, *n_col, p);

	// Define the mailbox
	mailbox = (unsigned *)0x6000;
	mailbox1 = (unsigned *)0x6100;
	mailbox2 = (unsigned *)0x6200;
	mailbox3 = (unsigned *)0x6300;

	// Initialize the buffer address for dma chain test
	src = (int *)0x2000;
	dst = (int *)(neighbour_core + (unsigned)0x2000);
	src1 = (int *)0x2300;
	dst1 = (int *)(neighbour_core + (unsigned)0x2500);
	src2 = (int *)0x2600;
	dst2 = (int *)(neighbour_core + (unsigned)0x2a00);

	// Test for word size
 	// Initialize the source and destination buffer
	for (i=0; i<tran; i++)
	{ 
		src[i]  = 0xaaaaaaaa;
		src1[i] = 0xbbbbbbbb;
		src2[i] = 0xcccccccc;
	}

	for (i=0; i<tran*6; i++)
	{
		dst[i] = 0x00000000;
	}
	

	// Prepare for the descriptor for 2d dma 
	
	e_dma_set_desc(E_DMA_0,(E_DMA_ENABLE|E_DMA_MASTER|E_DMA_WORD), 0x0000,
	0x0004, 0x0004,
	0x0080, 0x0003,
	0x0104 , 0x0304,
	(void *)src,(void *)dst, &dma_desc[0]);
	
	// Start transaction
	e_dma_start(&dma_desc[0], E_DMA_0);
	
	// Wait
	e_dma_wait(E_DMA_0);		
	
	// Check the destination buffer value
	index[0] = checkbuffer(dst,  (unsigned)0xaaaaaaaa, tran);
	index[1] = checkbuffer(dst1, (unsigned)0xbbbbbbbb, tran);
	index[2] = checkbuffer(dst2, (unsigned)0xcccccccc, tran);

	if((index[0]|index[1]|index[2]) == 0)
	{
		mailbox[0] = 0xffffffff;
	}else
	{
		mailbox[0] = 0x00000000;
	}

	// Test for doubleword size
 	// Initialize the source and destination buffer
	for (i=0; i<tran; i++)
	{ 
		src[i]  = 0xaaaaaaaa;
		src1[i] = 0xbbbbbbbb;
		src2[i] = 0xcccccccc;
	}

	for (i=0; i<tran*6; i++)
	{
		dst[i] = 0x00000000;
	}
	
	

	// Prepare for the descriptor for 2d dma 
	
	e_dma_set_desc(E_DMA_0,(E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000,
	0x0008, 0x0008,
	0x0040, 0x0003,
	0x0108 , 0x0308,
	(void *)src,(void *)dst, &dma_desc[0]);
	
	// Start transaction
	e_dma_start(&dma_desc[0], E_DMA_0);
	
	// Wait 
	e_dma_wait(E_DMA_0);	

	// Check the destination buffer value
	index[0] = checkbuffer(dst,  (unsigned)0xaaaaaaaa, tran);
	index[1] = checkbuffer(dst1, (unsigned)0xbbbbbbbb, tran);
	index[2] = checkbuffer(dst2, (unsigned)0xcccccccc, tran);

	if((index[0]|index[1]|index[2]) == 0)
	{
		mailbox1[0] = 0xffffffff;
	}else
	{
		mailbox1[0] = 0x00000000;
	}

	// Test for half size
 	// Initialize the source and destination buffer
	for (i=0; i<tran; i++)
	{ 
		src[i]  = 0xaaaaaaaa;
		src1[i] = 0xbbbbbbbb;
		src2[i] = 0xcccccccc;
	}

	for (i=0; i<tran*6; i++)
	{
		dst[i] = 0x00000000;
	}
	
	

	// Prepare for the descriptor for 2d dma 
	
	e_dma_set_desc(E_DMA_0,(E_DMA_ENABLE|E_DMA_MASTER|E_DMA_HWORD), 0x0000,
	0x0002, 0x0002,
	0x0100, 0x0003,
	0x0102 , 0x0302,
	(void *)src,(void *)dst, &dma_desc[0]);
	
	// Start transaction
	e_dma_start(&dma_desc[0], E_DMA_0);
	
	// Wait 
	e_dma_wait(E_DMA_0);		

	// Check the destination buffer value
	index[0] = checkbuffer(dst,  (unsigned)0xaaaaaaaa, tran);
	index[1] = checkbuffer(dst1, (unsigned)0xbbbbbbbb, tran);
	index[2] = checkbuffer(dst2, (unsigned)0xcccccccc, tran);

	if((index[0]|index[1]|index[2]) == 0)
	{
		mailbox2[0] = 0xffffffff;
	}else
	{
		mailbox2[0] = 0x00000000;
	}

	// Test for byte size
 	// Initialize the source and destination buffer
	for (i=0; i<tran; i++)
	{ 
		src[i]  = 0xaaaaaaaa;
		src1[i] = 0xbbbbbbbb;
		src2[i] = 0xcccccccc;
	}

	for (i=0; i<tran*6; i++)
	{
		dst[i] = 0x00000000;
	}
	

	// Prepare for the descriptor for 2d dma 
	
	e_dma_set_desc(E_DMA_0,(E_DMA_ENABLE|E_DMA_MASTER|E_DMA_BYTE), 0x0000,
	0x0001, 0x0001,
	0x0200, 0x0003,
	0x0101 , 0x0301,
	(void *)src,(void *)dst, &dma_desc[0]);
	
	// Start transaction
	e_dma_start(&dma_desc[0], E_DMA_0);
	
	// Wait 
	e_dma_wait(E_DMA_0);	
	
	// Check the destination buffer value
	index[0] = checkbuffer(dst,  (unsigned)0xaaaaaaaa, tran);
	index[1] = checkbuffer(dst1, (unsigned)0xbbbbbbbb, tran);
	index[2] = checkbuffer(dst2, (unsigned)0xcccccccc, tran);

	if((index[0]|index[1]|index[2]) == 0)
	{
		mailbox3[0] = 0xffffffff;
	}else
	{
		mailbox3[0] = 0x00000000;
	}
	return 0;
}
Exemple #4
0
int main()
{
  msg_t *m = (msg_t *) 0x4000;
  uint64_t d64;
  uint32_t d32;
  uint16_t d16;
  uint8_t  d8;
  unsigned int dt;
  void *da, *dua, *dc;
  for (unsigned int row = 0; row < E_ROWS; row++) {
    for (unsigned int col = 0; col < E_COLS; col++) {
      if (row == 0 && col == 0) continue;
      unsigned int tcore = row * E_COLS + col;

      da = e_get_global_address(row, col, (void *) MEM_ALINEADA);
      dua = e_get_global_address(row, col, (void *) MEM_NO_ALINEADA);
      dc = e_get_global_address(row, col, (void *) MEM_A_CABALLO);

      e_dma_wait(E_DMA_1);
      e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX);
      e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
      for (int i = 0; i < VECES; i++)
        e_dma_copy(&d64, da, sizeof(d64));
      e_dma_wait(E_DMA_1);
      e_wait(E_CTIMER_1, DUMMY_WAIT);
      e_ctimer_stop(E_CTIMER_0);
      dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0);
      m->ticks[tcore].t64 = (dt - DUMMY_WAIT) / (double) VECES;

      e_dma_wait(E_DMA_1);
      e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX);
      e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
      for (int i = 0; i < VECES; i++)
        e_dma_copy(&d32, da, sizeof(d32));
      e_dma_wait(E_DMA_1);
      e_wait(E_CTIMER_1, DUMMY_WAIT);
      e_ctimer_stop(E_CTIMER_0);
      dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0);
      m->ticks[tcore].t32 = (dt - DUMMY_WAIT) / (double) VECES;

      e_dma_wait(E_DMA_1);
      e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX);
      e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
      for (int i = 0; i < VECES; i++)
        e_dma_copy(&d16, da, sizeof(d16));
      e_dma_wait(E_DMA_1);
      e_wait(E_CTIMER_1, DUMMY_WAIT);
      e_ctimer_stop(E_CTIMER_0);
      dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0);
      m->ticks[tcore].t16 = (dt - DUMMY_WAIT) / (double) VECES;

      e_dma_wait(E_DMA_1);
      e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX);
      e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
      for (int i = 0; i < VECES; i++)
        e_dma_copy(&d8, da, sizeof(d8));
      e_dma_wait(E_DMA_1);
      e_wait(E_CTIMER_1, DUMMY_WAIT);
      e_ctimer_stop(E_CTIMER_0);
      dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0);
      m->ticks[tcore].t8 = (dt - DUMMY_WAIT) / (double) VECES;

      e_dma_wait(E_DMA_1);
      e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX);
      e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
      for (int i = 0; i < VECES; i++)
        e_dma_copy(&d64, dua, sizeof(d64));
      e_dma_wait(E_DMA_1);
      e_wait(E_CTIMER_1, DUMMY_WAIT);
      e_ctimer_stop(E_CTIMER_0);
      dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0);
      m->ticks[tcore].ua64 = (dt - DUMMY_WAIT) / (double) VECES;

      e_dma_wait(E_DMA_1);
      e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX);
      e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
      for (int i = 0; i < VECES; i++)
        e_dma_copy(&d32, dua, sizeof(d32));
      e_dma_wait(E_DMA_1);
      e_wait(E_CTIMER_1, DUMMY_WAIT);
      e_ctimer_stop(E_CTIMER_0);
      dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0);
      m->ticks[tcore].ua32 = (dt - DUMMY_WAIT) / (double) VECES;

      e_dma_wait(E_DMA_1);
      e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX);
      e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
      for (int i = 0; i < VECES; i++)
        e_dma_copy(&d16, dua, sizeof(d16));
      e_dma_wait(E_DMA_1);
      e_wait(E_CTIMER_1, DUMMY_WAIT);
      e_ctimer_stop(E_CTIMER_0);
      dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0);
      m->ticks[tcore].ua16 = (dt - DUMMY_WAIT) / (double) VECES;

      e_dma_wait(E_DMA_1);
      e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX);
      e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
      for (int i = 0; i < VECES; i++)
        e_dma_copy(&d8, dua, sizeof(d8));
      e_dma_wait(E_DMA_1);
      e_wait(E_CTIMER_1, DUMMY_WAIT);
      e_ctimer_stop(E_CTIMER_0);
      dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0);
      m->ticks[tcore].ua8 = (dt - DUMMY_WAIT) / (double) VECES;

      e_dma_wait(E_DMA_1);
      e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX);
      e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
      for (int i = 0; i < VECES; i++)
        e_dma_copy(&d64, dua, sizeof(d64));
      e_dma_wait(E_DMA_1);
      e_wait(E_CTIMER_1, DUMMY_WAIT);
      e_ctimer_stop(E_CTIMER_0);
      dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0);
      m->ticks[tcore].c64 = (dt - DUMMY_WAIT) / (double) VECES;

      e_dma_wait(E_DMA_1);
      e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX);
      e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
      for (int i = 0; i < VECES; i++)
        e_dma_copy(&d32, dc, sizeof(d32));
      e_dma_wait(E_DMA_1);
      e_wait(E_CTIMER_1, DUMMY_WAIT);
      e_ctimer_stop(E_CTIMER_0);
      dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0);
      m->ticks[tcore].c32 = (dt - DUMMY_WAIT) / (double) VECES;

      e_dma_wait(E_DMA_1);
      e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX);
      e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
      for (int i = 0; i < VECES; i++)
        e_dma_copy(&d16, dc, sizeof(d16));
      e_dma_wait(E_DMA_1);
      e_wait(E_CTIMER_1, DUMMY_WAIT);
      e_ctimer_stop(E_CTIMER_0);
      dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0);
      m->ticks[tcore].c16 = (dt - DUMMY_WAIT) / (double) VECES;

      e_dma_wait(E_DMA_1);
      e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX);
      e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
      for (int i = 0; i < VECES; i++)
        e_dma_copy(&d8, dc, sizeof(d8));
      e_dma_wait(E_DMA_1);
      e_wait(E_CTIMER_1, DUMMY_WAIT);
      e_ctimer_stop(E_CTIMER_0);
      dt = E_CTIMER_MAX - e_ctimer_get(E_CTIMER_0);
      m->ticks[tcore].c8 = (dt - DUMMY_WAIT) / (double) VECES;
    }
  }
  m->finalizado = E_TRUE;

  return 0;
}
Exemple #5
0
__kernel void
my_thread( void* p) {

	my_args_t* pargs = (my_args_t*)p;

	int N = pargs->N, s = pargs->s, d = pargs->d; 
	float *ga = pargs->ga, *gb = pargs->gb, *gc = pargs->gc;
	int n = N/d;

	int myrank_2d, mycoords[2];
	int dims[2] = {d, d};
	int periods[2] = {1, 1};

	MPI_Status status;
	MPI_Init(0,MPI_BUF_SIZE);

	MPI_Comm comm = MPI_COMM_THREAD;
	MPI_Comm comm_2d;
	MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d);
	MPI_Comm_rank(comm_2d, &myrank_2d);
	MPI_Cart_coords(comm_2d, myrank_2d, 2, mycoords);
	// Compute ranks of the up and left shifts
	int uprank, downrank, leftrank, rightrank;
	MPI_Cart_shift(comm_2d, 0, 1, &leftrank, &rightrank);
	MPI_Cart_shift(comm_2d, 1, 1, &uprank, &downrank);

	int x = mycoords[0];
	int y = mycoords[1];
	// this removes initial skew shift by reading in directly
	int skew = (x+y) % d;

	void* memfree = coprthr_tls_sbrk(0);
	float* a = (float*)coprthr_tls_sbrk(n*n*sizeof(float));
	float* b = (float*)coprthr_tls_sbrk(n*n*sizeof(float));
	float* c = (float*)coprthr_tls_sbrk(n*n*sizeof(float));

	e_dma_desc_t dma_c_read, dma_c_write, dma_a_read, dma_b_read;

#define DWORD_WRITE(desc,w,h,W,src,dst) \
	e_dma_set_desc(E_DMA_0, (E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000, \
	0x0008, 0x0008, \
	w/2, h, \
	8, 4*(W-w+2), \
	(void*)src, (void*)dst, &desc)
#define DWORD_READ(desc,w,h,W,src,dst) \
	e_dma_set_desc(E_DMA_0, (E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000, \
	0x0008, 0x0008, \
	w/2, h, \
	4*(W-w+2), 8, \
	(void*)src, (void*)dst, &desc)

int loop;
for(loop=0;loop<LOOP1;loop++) {

	int i,j,k,l;
	for (i=0; i<s; i++) {
		for (j=0; j<s; j++) {
			float* rgc = gc + ((i*N + y*n)*s + j)*N + x*n;
			DWORD_WRITE(dma_c_write,n,n,s*N,c,rgc);
			DWORD_READ(dma_c_read,n,n,s*N,rgc,c);
			// read C
			e_dma_start(&dma_c_read, E_DMA_0);
			e_dma_wait(E_DMA_0);
			for (k=0; k<s; k++) {
				float* rga = ga + ((i*N + y*n)*s + k)*N + skew*n;
				float* rgb = gb + ((k*N + skew*n)*s + j)*N + x*n;
				// read A and B
				DWORD_READ(dma_b_read,n,n,s*N,rgb,b);
				DWORD_READ(dma_a_read,n,n,s*N,rga,a);
				e_dma_start(&dma_b_read, E_DMA_0);
				e_dma_wait(E_DMA_0);
				e_dma_start(&dma_a_read, E_DMA_0);
				e_dma_wait(E_DMA_0);
				// transpose B
				int ji, ii;
				for (ji=0; ji<n-1; ji++) {
					for(ii=ji+1; ii<n; ii++) {
						int tmp = b[ji*n+ii];
						b[ji*n+ii] = b[ii*n+ji];
						b[ii*n+ji] = tmp;
					}
				}
				int loop;
				for (loop=0;loop<LOOP3;loop++) {
				// Get into the main computation loop
				for (l=1; l<d; l++) {
					int loop;
					for(loop=0;loop<LOOP2;loop++)
					MatrixMultiply(n, a, b, c);
					// Shift matrix a left by one and shift matrix b up by one
					MPI_Sendrecv_replace(a, n*n, MPI_FLOAT, leftrank, 1, rightrank, 1, comm_2d, &status);
					MPI_Sendrecv_replace(b, n*n, MPI_FLOAT, uprank, 1, downrank, 1, comm_2d, &status);
				}
				MatrixMultiply(n, a, b, c);
			} // end LOOP3
			}
			// write C
			e_dma_start(&dma_c_write, E_DMA_1);
			e_dma_wait(E_DMA_1);
		}
	}
} // end LOOP1

	coprthr_tls_brk(memfree);

	MPI_Finalize();

}