Esempio n. 1
0
/*
 * Return entry n of the prime table through a small local-store cache.
 *
 * On a miss the cache is refilled with one DMA of CACHE_PRIME_SIZE*16 bytes
 * starting at the 4-entry-aligned index that contains n, and
 * primeCacheStart is moved to that index.
 *
 * NOTE(review): the hit test uses a strict '>' on primeCacheStart, so a
 * request for exactly primeCacheStart always refetches -- confirm intended.
 * NOTE(review): the result of mfc_tag_reserve() is not checked here.
 */
int cacheGetPrime(int n)
{
    const int hit = (n < primeCacheStart + primeCacheSize) && (n > primeCacheStart);

    if (!hit)
    {
        /* Miss: fetch the cache line that starts at the aligned index. */
        const int lineStart = n - (n % 4);
        unsigned long long EA = setup.vPrimes + lineStart * 4;
        uint32_t size = CACHE_PRIME_SIZE*16;
        uint32_t tag = mfc_tag_reserve();

        mfc_get(&primeCacheData, EA, size, tag, 0, 0);

        /* Block until the transfer tagged above has completed. */
        mfc_write_tag_mask(1 << tag);
        mfc_read_tag_status_all();
        mfc_tag_release(tag);

        primeCacheStart = lineStart;
    }

    /* Each cache element packs four entries; pick the vector, then the lane. */
    return spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4);
}
/*
 * SPU entry point: DMA the ppu_data_t descriptor from argp (envp carries the
 * transfer size in bytes), then call process_frame() once per frame.
 *
 * Returns 0 on success, -1 when no MFC tag could be reserved.
 */
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp)
{
	ppu_data_t ppu_data __attribute__ ((aligned(16)));
	int frame;

	tag_id = mfc_tag_reserve();
	if (tag_id == MFC_TAG_INVALID) {
		printf("SPU: ERROR can't allocate tag ID\n");
		return -1;
	}

	/* Fetch via DMA the structure holding the pointers, frame count and spe_id */
	dprintf("SPU: am intrat in spu %llx %lu %llx\n",
			speid, sizeof(ppu_data_t), envp);
	mfc_get((void*)&ppu_data, argp, (uint32_t)envp, tag_id, 0, 0);
	waittag(tag_id);

	dprintf("SPU: speid:%llx got struct\n", speid);
	dprintf("SPU: speid:%llx id:%02d input:%p big_img:%p num_frms:%d\n",
			speid, ppu_data.spe_id, ppu_data.input, ppu_data.big_image,
			ppu_data.num_frames);
	/* Keep speid referenced even if dprintf() expands to nothing. */
	(void)speid;

	/* Frame processing goes here */
	frame = 0;
	while (frame < ppu_data.num_frames) {
		process_frame(ppu_data, frame);
		++frame;
	}

	return 0;
}
Esempio n. 3
0
/*
 * Reserve one MFC tag per memcpy slot and store it in memcpy_tag[].
 * NOTE(review): reservation failures (MFC_TAG_INVALID) are stored unchecked.
 */
void init_memcpy()
{
	int slot = 0;

	while (slot < NUM_MEMCPY_TAGS) {
		memcpy_tag[slot] = mfc_tag_reserve();
		slot++;
	}
}
Esempio n. 4
0
/*
 * SPU entry point: fetch the control block at argp, then fetch the data
 * array it points to, and verify the array is a Fibonacci sequence.
 *
 * Returns 0 when the sequence is valid, 1 on a tag reservation failure or
 * on the first sequence mismatch.
 */
int main(unsigned long long speid __attribute__ ((unused)), 
	 unsigned long long argp, 
	 unsigned long long envp __attribute__ ((unused))) 
{
  unsigned int tag_id;
  int idx;

  /* A tag group is needed to track completion of our DMA transfers. */
  tag_id = mfc_tag_reserve();
  if (tag_id == MFC_TAG_INVALID) {
    printf("ERROR: unable to reserve a tag\n");
    return 1;
  }

  /*
   * mfc_get(ls, ea, size, tag, tid, rid):
   *   ls   - local store destination
   *   ea   - main memory source address
   *   size - number of bytes to transfer
   *   tag  - tag group (0..31) used to wait for completion
   *   tid/rid - only relevant with a custom cache replacement policy; 0 here
   */
  mfc_get(&cb, argp, sizeof(cb), tag_id, 0, 0);

  /* Select our tag group (bit = 1 << tag) and block until it is idle. */
  mfc_write_tag_mask(1<<tag_id);
  mfc_read_tag_status_all();

  /* Start pulling the data array named by the control block. */
  mfc_get(data, cb.addr, DATA_BUFFER_SIZE, tag_id, 0, 0);

  /* Printed while the transfer above is still in flight. */
  printf("Address received through control block = 0x%llx\n", cb.addr);

  /* The tag mask is still set from above; wait for the data array. */
  mfc_read_tag_status_all();

  /* Every entry from index 2 on must equal the sum of its two predecessors. */
  idx = 2;
  while (idx < DATA_BUFFER_ENTRIES) {
    if (data[idx] != data[idx-1] + data[idx-2]) {
      printf("ERROR: fibonacci sequence error at entry %d. Expected %d, Got %d\n",
	     idx, data[idx-1] + data[idx-2], data[idx]);
      return (1);
    }
    idx++;
  }

  return 0;
}
Esempio n. 5
0
/*
 * SPU entry point for the kd-tree build: reserve the DMA tags, fetch the
 * kdbuild_arg_t argument block from parm, run MakeNodes() and MakeLeaves(),
 * then write numleafpolys back to the address given in the argument block.
 *
 * NOTE(review): none of the mfc_tag_reserve() results are checked.
 */
int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long parm)
{
	uint arg_tag;

	init_spu_abs();
	init_memcpy();

	/* Tag reservation order is kept: local tag, job tag, two PPE memcpy tags. */
	arg_tag = mfc_tag_reserve();
	jobtag = mfc_tag_reserve();
	memcpy_tag_ppe[0] = mfc_tag_reserve();
	memcpy_tag_ppe[1] = mfc_tag_reserve();

	/* Pull the argument block into local store and wait for it. */
	spu_mfcdma32(&arg, (unsigned int)parm, (unsigned int)sizeof(kdbuild_arg_t), arg_tag, MFC_GET_CMD);
	DmaWait(arg_tag);

	/* Seed the build state from the freshly fetched arguments. */
	total_leaf_size = 0;
	curjob = 0;
	curleaf = arg.curleaf;
	nsplitaxises = arg.nsplitaxises;
	nsamplepoints = arg.nsamplepoints;

	MakeNodes();
	DmaWaitAll();

	MakeLeaves();
	DmaWaitAll();

	/* Report the leaf polygon count back to main memory. */
	spu_mfcdma32(&numleafpolys, (unsigned int)arg.numleafpolys, sizeof(int), arg_tag, MFC_PUT_CMD);
	DmaWait(arg_tag);

	return 0;
}
/*
 * SPU entry point for the single-buffered particle simulation.
 *
 * Fetches the parm_context at parm, then for every time step walks the
 * particles in blocks of PARTICLES_PER_BLOCK: GET position, velocity and
 * inverse mass, integrate one step, PUT position and velocity back.
 */
int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long parm)
{
  unsigned int tag_id;
  vector float dt_v;
  float time;
  int first, p;

  // Reserve a tag ID
  tag_id = mfc_tag_reserve();

  // Select all tag groups: every status read below waits for all pending DMA.
  spu_writech(MFC_WrTagMask, -1);

  // parm points at the particle parameter context; fetch it and wait.
  spu_mfcdma32((void *)(&ctx), (unsigned int)parm, sizeof(parm_context), tag_id, MFC_GET_CMD);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  dt_v = spu_splats(ctx.dt);

  // Outer loop: time steps. Inner loop: blocks of particles.
  for (time=0; time<END_OF_TIME; time += ctx.dt) {
    for (first=0; first<ctx.particles; first+=PARTICLES_PER_BLOCK) {
      int remaining = ctx.particles - first;
      int cnt;
      unsigned int pos_ea, vel_ea, mass_ea;
      unsigned int vec_bytes, mass_bytes;

      // The last block may be shorter than PARTICLES_PER_BLOCK.
      if (remaining < PARTICLES_PER_BLOCK) {
	cnt = remaining;
      } else {
	cnt = PARTICLES_PER_BLOCK;
      }

      pos_ea = (unsigned int)(ctx.pos_v+first);
      vel_ea = (unsigned int)(ctx.vel_v+first);
      mass_ea = (unsigned int)(ctx.inv_mass+first);
      vec_bytes = cnt * sizeof(vector float);
      mass_bytes = cnt * sizeof(float);

      // Fetch the block. The first GET carries a barrier (GETB), so it is
      // ordered behind the PUTs issued for the previous block on this tag.
      spu_mfcdma32((void *)(pos), pos_ea, vec_bytes, tag_id, MFC_GETB_CMD);
      spu_mfcdma32((void *)(vel), vel_ea, vec_bytes, tag_id, MFC_GET_CMD);
      spu_mfcdma32((void *)(inv_mass), mass_ea, mass_bytes, tag_id, MFC_GET_CMD);
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

      // Integrate: pos += vel*dt, then vel += (dt*inv_mass)*force.
      for (p=0; p<cnt; p++) {
	vector float dt_inv_mass_v;

	pos[p] = spu_madd(vel[p], dt_v, pos[p]);
	dt_inv_mass_v = spu_mul(dt_v, spu_splats(inv_mass[p]));
	vel[p] = spu_madd(dt_inv_mass_v, ctx.force_v, vel[p]);
      }

      // Write the updated position and velocity back to system memory.
      spu_mfcdma32((void *)(pos), pos_ea, vec_bytes, tag_id, MFC_PUT_CMD);
      spu_mfcdma32((void *)(vel), vel_ea, vec_bytes, tag_id, MFC_PUT_CMD);
    }
  }

  // Drain the final PUTs before the SPU thread terminates.
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
  return (0);
}
Esempio n. 7
0
/*
 * Start DMA gets that refill the input ring buffer of the active merger
 * (md[am] / mcb[am]) on the given side.
 *
 * The element count is the smallest of: free slots in the ring buffer,
 * elements still unread in main memory, and MAX_DMA_SIZE. When the free
 * region wraps around the end of the ring, the request is split into two
 * transfers that share one tag. The function does not wait for completion;
 * it stores the tag in held_tag[side] and the count in num_waiting[side].
 * It returns without side effects when there is nothing to pull or no tag
 * can be reserved.
 */
void pull(int side){
  const int free_slots = num_free_in_buffer(side);
  const int unread = mcb[am].data_size[side] - md[am].num_pulled[side];
  int num_pull;
  int head;
  int until_wrap;
  int first_pull;
  int tag;
  unsigned int ls_addr;

  /* Clamp the request to what fits, what exists, and the DMA limit. */
  if(free_slots < unread)
    num_pull = free_slots;
  else
    num_pull = unread;
  if(!(num_pull < MAX_DMA_SIZE))
    num_pull = MAX_DMA_SIZE;

  /* Elements that fit between the head index and the end of the ring. */
  head = spu_extract(md[am].idx[side][HEAD],0);
  until_wrap = mcb[am].buffer_size[side] - head;
  if(num_pull < until_wrap)
    first_pull = num_pull;
  else
    first_pull = until_wrap;

  if(!first_pull)
    return;

  ls_addr = (unsigned int) &md[am].buffer[side][head];

  tag = mfc_tag_reserve();
  if(tag == MFC_TAG_INVALID)
    return;
  md[am].held_tag[side] = tag;

  /* First transfer: from the head index up to the end of the ring. */
  mfc_get((void*)ls_addr,
	  mcb[am].block_addr[side],
	  first_pull * sizeof(vector signed int),
	  md[am].held_tag[side],
	  0,0);
  mcb[am].block_addr[side] += first_pull * sizeof(vector signed int);

  /* Second transfer: the wrapped remainder, placed at the ring start. */
  if(first_pull < num_pull){
    int second_pull = num_pull - first_pull;

    ls_addr = (unsigned int) &md[am].buffer[side][0];
    mfc_get((void*)ls_addr,
	    mcb[am].block_addr[side],
	    second_pull * sizeof(vector signed int),
	    md[am].held_tag[side],
	    0,0);
    mcb[am].block_addr[side] += second_pull * sizeof(vector signed int);
  }

  md[am].num_waiting[side] = num_pull;
}
Esempio n. 8
0
/*
 * SPU entry point for the solver.
 *
 * Steps:
 *   1. reserve a DMA tag (exits through as_exitf() on failure);
 *   2. DMA the shared descriptor `sd` from the PPE address argp;
 *   3. allocate a 16-byte aligned local-store block for the value vector
 *      and fetch it from main memory in pieces of at most SPU_MAX_DMA bytes;
 *   4. seed the random generator, run Solve() and send the result back
 *      with as_send().
 *
 * Returns 0; calls exit(1) when the local-store allocation fails.
 *
 * The debug traces pass arguments whose types match their conversion
 * specifiers (sizeof is printed as unsigned long, tags as int, local-store
 * addresses as unsigned).
 */
int main (uint64_t speid, uint64_t argp)
{
  DPRINTF ("+(spu)main (%lld, %lld)\n", speid, argp);

  // -- reserve DMA tag ID for this SPU ---------------------------------------
  if ((tag = mfc_tag_reserve()) == MFC_TAG_INVALID)
    as_exitf ("ERROR - can't reserve a tag\n");
  DPRINTF (" [%lld] mfc_tag_reserve() = %d\n", speid, (int) tag);

  // -- get CBE and problem information from system memory. -------------------
  DPRINTF (" [%lld] mfc_get (0x%x, 0x%llx, %lu, %d, 0, 0)\n", speid,
	   (unsigned) &sd, argp, (unsigned long) sizeof(sd), (int) tag);
  mfc_getb (&sd, argp, sizeof(sd), tag, 0, 0);
  DPRINTF (" [%lld] waittag (%d)\n", speid, (int) tag);
  waittag (tag);

  sd.sd_ea = argp;		// save PPE address of sd block
  sd.value = sd.ad.sol;		// save PPE address of solution vector
  sd.size = ROUND_UP_128 (sd.ad.size_in_bytes);
  sd.ad.sol = memalign (16, sd.size); // allocate LS block
  if (sd.ad.sol == NULL) {
    fprintf (stderr,
	     "%s:%d: malloc failed in SPU %d\n", __FILE__, __LINE__, sd.num);
    exit(1);
  }


#if defined(DEBUG) && (DEBUG & 16)
  printf ("spe%d: &sd=0x%x, sd.value=0x%x, sd.ad.sol=0x%x\n",
	  sd.num, (unsigned) &sd, (unsigned) sd.value, (unsigned) sd.ad.sol);
#endif
  // -- *TBD* -- does sd.value need to be remapped (EA?)
  // -- get value vector from system memory into new LS block -----------------
  DPRINTF (" [%lld] mfc_get (0x%x, 0x%x, %d, %d, 0, 0)\n", speid,
	   (unsigned) sd.ad.sol, (unsigned) sd.value,
	   (int) sd.size, (int) tag);
  

  // -- fix pb with DMA limitation (max = 16 KB) ------------------------------
  {
    int nbytes = sd.size;
    char *addr_ls = (char *) sd.ad.sol;
    char *addr_ea = (char *) sd.value;

    // Transfer the vector piecewise; each piece is waited for before the
    // next one is issued.
    do {
      int sz = (nbytes < SPU_MAX_DMA)? nbytes: SPU_MAX_DMA;

      mfc_getb (addr_ls, (uint32_t) addr_ea, sz, tag, 0, 0);
      waittag (tag);

      nbytes -= sz;
      addr_ls += sz;
      addr_ea += sz;
    } while (nbytes > 0);
  }

#if defined(DEBUG) && (DEBUG & 8)
  printf (" [%lld] as_init dump:", speid);
  printf ("   sd.num = %d", sd.num);
  printf ("   sd.ctx = %d", (int) sd.ctx);
  printf ("   sd.thr = %d\n", (int) sd.thr);
#endif

#if defined(DEBUG) && (DEBUG & 2)
  if (sd.ad.do_not_init) {
    printf ("(SPU %d: received data (do_not_init=1):\n", sd.num);
    Ad_Display(sd.ad.sol, &sd.ad, NULL);
    printf(")\n");
  }
#endif
  
  // Mix the SPU number into the seed so each SPU gets a different sequence.
  Randomize_Seed (sd.ad.seed ^ sd.num);

  // -- call the benchmark-specific solver
  Solve (&sd.ad);
  
  // -- put the solution back on main memory for the PPE to read
  as_send ();

  //  printf ("SPU main returning\n");
  return 0;
}
Esempio n. 9
0
/*
 * SPU entry point for the DMA-list example.
 *
 * Fetches the control block at argp, then processes the assigned chunks
 * NUM_LIST_ELEMENTS at a time: gather the input chunks with one DMA get
 * list, run process_data_simd() over them, scatter the results with one
 * DMA put list. Every transfer is waited for before the next step.
 *
 * Returns 0 on success, 1 when no MFC tag could be reserved.
 */
int main(unsigned long long speid __attribute__ ((unused)),
         unsigned long long argp, 
         unsigned long long envp __attribute__ ((unused)))
{
  unsigned int tag;
  unsigned int first_chunk, num_chunks;
  unsigned int list_ls_addr;
  unsigned long long in_addr, out_addr;
  mfc_list_element_t* dma_list_in;

#ifdef USE_TIMER
  uint64_t start, time_working;
    
  spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih);
  spu_clock_start();
  start = spu_clock_read();
#endif /* USE_TIMER */

  /* A tag group is required to track our DMA commands. */
  tag = mfc_tag_reserve();
  if (tag == MFC_TAG_INVALID)
  {
    printf ("SPU ERROR, unable to reserve tag\n");
    return 1;
  }

  /* The input list lives in the tail of the input buffer: compute the
   * address NUM_LIST_ELEMENTS list entries before the buffer's end offset. */
  list_ls_addr = (unsigned int)((local_buffer_in + sizeof(float)*CHUNK_SIZE * NUM_LIST_ELEMENTS) - 
      (sizeof (mfc_list_element_t) * NUM_LIST_ELEMENTS));
  dma_list_in = (mfc_list_element_t*) (list_ls_addr);

  /* Fetch the control block from system memory and wait for it. */
  mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0);
  mfc_write_tag_mask (1 << tag);
  mfc_read_tag_status_all ();

  /* Number of chunks assigned to this SPE. */
  num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE;

  /* Main loop: one gather / process / scatter round per group of
   * NUM_LIST_ELEMENTS chunks. */
  for (first_chunk = 0; first_chunk <num_chunks; first_chunk += NUM_LIST_ELEMENTS)
  {
    /* Effective addresses of this round's input and output data. */
    in_addr = control_block.in_addr + (first_chunk * CHUNK_SIZE * sizeof (float));
    out_addr = control_block.out_addr + (first_chunk * CHUNK_SIZE * sizeof (float));

    /* Gather: build the list (low 32 bits of EA plus size per element),
     * issue the get list into local_buffer_in, then wait. */
    fill_dma_list (dma_list_in, NUM_LIST_ELEMENTS, in_addr, CHUNK_SIZE * sizeof(float)); 
    mfc_getl (local_buffer_in, in_addr, dma_list_in, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), tag, 0, 0);
    mfc_write_tag_mask (1 << tag);
    mfc_read_tag_status_all ();

    /* Process everything that was just gathered into local store. */
    process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE * NUM_LIST_ELEMENTS);

    /* Scatter: build the output list, issue the put list from
     * local_buffer_out, then wait. */
    fill_dma_list (dma_list_out, NUM_LIST_ELEMENTS, out_addr, CHUNK_SIZE * sizeof(float)); 
    mfc_putl (local_buffer_out, out_addr, dma_list_out, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), 
        tag, 0, 0);
    mfc_write_tag_mask (1 << tag);
    mfc_read_tag_status_all ();
  }

#ifdef USE_TIMER
  time_working = (spu_clock_read() - start);
  spu_clock_stop();
  printf ("SPE time_working = %lld\n", time_working);
#endif /* USE_TIMER */

  return 0;
}
Esempio n. 10
0
/*
 * Add 1.0f to every element of buf_in and store the result in buf_out.
 *
 * buf_in  - input array of `size` floats, accessed as vector float
 * buf_out - output array of `size` floats, accessed as vector float
 * size    - number of float elements to process
 *
 * Whole groups of four floats are handled with one SIMD add each. When
 * `size` is not a multiple of four, the remaining one to three elements are
 * handled with scalar adds instead of being skipped.
 */
void process_data_simd (float* buf_in, float* buf_out, unsigned int size)
{
  unsigned int i;
  const unsigned int nvec = size / 4;
  vector float *vbuf_in, *vbuf_out;
  vector float v1 = (vector float) {1.0f, 1.0f, 1.0f, 1.0f};
  vbuf_in = (vector float*) buf_in;
  vbuf_out = (vector float*) buf_out;

  /* Vector part: four floats per iteration. */
  for (i = 0; i < nvec; i++)
  {
   vbuf_out[i] = spu_add (vbuf_in[i], v1); 
  }

  /* Scalar tail: elements past the last full vector. */
  for (i = nvec * 4; i < size; i++)
  {
   buf_out[i] = buf_in[i] + 1.0f;
  }
}

/*
 * SPU entry point for the single-buffered chunk example.
 *
 * Fetches the control block at argp, then for each assigned chunk: GET the
 * chunk into local_buffer_in, run process_data_simd(), PUT local_buffer_out
 * back. Every transfer is waited for before the next step starts.
 *
 * Returns 0 on success, 1 when no MFC tag could be reserved.
 */
int main(unsigned long long speid __attribute__ ((unused)),
         unsigned long long argp, 
         unsigned long long envp __attribute__ ((unused)))
{
  unsigned int tag;
  int chunk, num_chunks;
  unsigned long long in_addr, out_addr;

#ifdef USE_TIMER
  uint64_t start, time_working;
  spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih);
  spu_clock_start();
  start = spu_clock_read();
#endif /* USE_TIMER */

  /* A tag group is required to track our DMA commands. */
  tag = mfc_tag_reserve();
  if (tag == MFC_TAG_INVALID)
  {
    printf ("SPU ERROR, unable to reserve tag\n");
    return 1;
  }

  /* Fetch the control block from system memory and wait for it. */
  mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0);
  mfc_write_tag_mask (1 << tag);
  mfc_read_tag_status_all ();

  /* Number of chunks assigned to this SPE. */
  num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE;

  /* Main loop: fetch one chunk, process it, write it back, repeat. */
  chunk = 0;
  while (chunk < num_chunks)
  {
    /* Effective addresses of this chunk's input and output. */
    in_addr = control_block.in_addr + (chunk* CHUNK_SIZE * sizeof(float));
    out_addr = control_block.out_addr + (chunk * CHUNK_SIZE * sizeof(float));

    /* GET the chunk and block until it has arrived. */
    mfc_get (local_buffer_in, in_addr, CHUNK_SIZE * sizeof(float), 
        tag, 0, 0);
    mfc_write_tag_mask (1 << tag);
    mfc_read_tag_status_all ();

    /* Work on the data that now sits in local store. */
    process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE);

    /* PUT the result and block until it has been written. */
    mfc_put (local_buffer_out, out_addr, CHUNK_SIZE * sizeof(float), 
        tag, 0, 0);
    mfc_write_tag_mask (1 << tag);
    mfc_read_tag_status_all ();

    chunk++;
  }
#ifdef USE_TIMER
  time_working = (spu_clock_read() - start);
  spu_clock_stop();
  printf ("SPE time_working = %lld\n", time_working);
#endif /* USE_TIMER */

  return 0;
}
Esempio n. 11
0
/*
 * SPU worker loop.
 *
 * Allocates four local bitset buffers (in, out, use, def) of
 * param.bitset_size bytes and reserves four DMA tags, then repeatedly:
 *   - reads an index from the inbound mailbox (UINT_MAX means "exit");
 *   - fetches the four bitsets at byte offset param.bitset_subsets * index
 *     from the addresses given in param;
 *   - calls bitset_megaop() on them;
 *   - writes the `in` bitset back and posts the index on the outbound
 *     interrupt mailbox.
 *
 * NOTE(review): a failed tag reservation is only logged; the loop still
 * runs with the invalid tag value.
 * NOTE(review): tag_id is declared but never used.
 * NOTE(review): the closing brace of this function is not present in the
 * visible source.
 */
void work(param_t param)
{
printf("SPU[%u] work()\n", param.proc);
	unsigned int inbox, offset;
    /* Local-store copies of the four bitsets handled per work item. */
    unsigned int *in = malloc_align(param.bitset_size, ALIGN_EXP);
    unsigned int *out = malloc_align(param.bitset_size, ALIGN_EXP);
    unsigned int *use = malloc_align(param.bitset_size, ALIGN_EXP);
    unsigned int *def = malloc_align(param.bitset_size, ALIGN_EXP);
    if(in == NULL || out == NULL || use == NULL || def == NULL) {
	    printf("malloc_align() failed\n");
	    exit(1);
    }
    unsigned tag_1, tag_2, tag_3, tag_4;
    unsigned int tag_id;   
    /* Reserve a tag for application usage */ 
    if ((tag_1 = mfc_tag_reserve()) == MFC_TAG_INVALID) 
    {
        printf("ERROR: unable to reserve a tag_1\n"); 
    }
    if ((tag_2 = mfc_tag_reserve()) == MFC_TAG_INVALID) 
    {
        printf("ERROR: unable to reserve a tag_2\n"); 
    }
    if ((tag_3 = mfc_tag_reserve()) == MFC_TAG_INVALID) 
    {
        printf("ERROR: unable to reserve a tag_3\n"); 
    }
    if ((tag_4 = mfc_tag_reserve()) == MFC_TAG_INVALID) 
    {
        printf("ERROR: unable to reserve a tag_4\n");
    } 

	while(1) {
		/* Blocks until a work index (or the exit marker) arrives. */
		inbox = spu_read_in_mbox();

        if(inbox == UINT_MAX)
        {
            printf("SPU[%u] received exit signal.. exiting.\n", param.proc);
            return;
        }
		
		/* Byte offset of this work item inside each bitset array. */
		offset = param.bitset_subsets*inbox;

		/* Fetch all four bitsets, one tag each, then wait for all of them. */
		mfc_get(in,  (unsigned int) (param.bs_in_addr  + offset), param.bitset_size, tag_1, 0, 0);
		mfc_get(out, (unsigned int) (param.bs_out_addr + offset), param.bitset_size, tag_2, 0, 0);
		mfc_get(use, (unsigned int) (param.bs_use_addr + offset), param.bitset_size, tag_3, 0, 0);
		mfc_get(def, (unsigned int) (param.bs_def_addr + offset), param.bitset_size, tag_4, 0, 0);
		mfc_write_tag_mask(1 << tag_1 | 1 << tag_2 | 1 << tag_3 | 1 << tag_4);
		mfc_read_tag_status_all();

/* Debug dump (inside the D() macro): addresses and the first 100 bits of each fetched set. */
D(printf("SPU[%d] index: %u  bitset_subsets: %u  offset: %u\n", param.proc, inbox, param.bitset_subsets, offset);
printf("SPU[%d]\t&use: %p\n\t&def: %p\n\t&out: %p\n\t&in:  %p\n", param.proc, (void*)param.bs_use_addr, (void*)param.bs_def_addr, (void*)param.bs_out_addr, (void*)param.bs_in_addr);
void *tmp_ptr = (void*) (param.bs_use_addr  + offset);
printf("SPU[%d] read\t\t&%p = use(%p)={", param.proc, (void*)use, tmp_ptr);
	for (int i = 0; i < 100; ++i){
	if ( bitset_get_bit(use, i) ) {
			printf("%d ", i);
		}
	}
printf("}\n");
tmp_ptr = (void*) (param.bs_def_addr  + offset);
printf("SPU[%d] read\t\t&%p = def(%p)={", param.proc, (void*)def, tmp_ptr);
	for (int i = 0; i < 100; ++i){
	if ( bitset_get_bit(def, i) ) {
			printf("%d ", i);
		}
	}
printf("}\n");
tmp_ptr = (void*) (param.bs_out_addr  + offset);
printf("SPU[%d] read\t\t&%p = out(%p)={", param.proc, (void*)out, tmp_ptr);
	for (int i = 0; i < 100; ++i){
	if ( bitset_get_bit(out, i) ) {
			printf("%d ", i);
		}
	}
printf("}\n");
tmp_ptr = (void*) (param.bs_in_addr  + offset);
printf("SPU[%d] read\t\t&%p = in (%p)={", param.proc, (void*)in, tmp_ptr);
	for (int i = 0; i < 100; ++i){
	if ( bitset_get_bit(in, i) ) {
			printf("%d ", i);
		}
	}
printf("}\n"));
		bitset_megaop(param, in, out, use, def);		

/* Debug dump (inside the D() macro): first 100 bits of `in` after bitset_megaop(). */
D(printf("SPU[%d] calculated\tin={", param.proc);
	for (int i = 0; i < 100; ++i){
	if ( bitset_get_bit(in, i) ) {
			printf("%d ", i);
		}
	}
printf("}\n");)

		/* Only `in` is written back; wait for the put before signalling. */
		mfc_put(in, (unsigned int)  (param.bs_in_addr  +  offset), param.bitset_size, tag_1, 0, 0);
		mfc_write_tag_mask(1 << tag_1);
		mfc_read_tag_status_all();

		/* Post the processed index on the outbound interrupt mailbox. */
		spu_write_out_intr_mbox(inbox);
	}
Esempio n. 12
0
/*
 * Build the interior nodes of the kd-tree level by level.
 *
 * arg.kdbuffer[] is used as a pair of work queues: entries are read from
 * kdbuffer[b] while the children that need further splitting are appended
 * to kdbuffer[1-b]; b is flipped after each pass, and the loop ends when
 * the queue to read is empty.
 *
 * For every queue entry two node indices are taken from arg.curnode, the
 * node record is written out with memcpy_ea(), and the entry is partitioned
 * into a left and a right child. A child that reaches arg.maxdepth or is no
 * larger than arg.maxleafsize goes to a leaf buffer; otherwise it goes to a
 * work queue. While curjob < arg.njobs the right child is placed in that
 * job's buffers, the job's buffer header is DMA'd out, and the job's
 * semaphore is posted.
 *
 * After the loop the remaining job semaphores are posted and
 * arg.curnode, total_leaf_size, arg.leafbuffer and arg.leaf_aabb_buffer are
 * written back to the addresses stored in arg.
 *
 * NOTE(review): the two mfc_tag_reserve() results are not checked.
 */
void MakeNodes()
{
	uint put_tag[2];

	put_tag[0] = mfc_tag_reserve();
	put_tag[1] = mfc_tag_reserve();

	/* Index of the work queue being read; 1-b is the one being filled. */
	ushort b = 0;

	kdbuffer_t l_kdb ALIGNED(16);
	kdbuffer_t r_kdb ALIGNED(16);

	kdnode_t node ALIGNED(16);
	kdbuffer_t kdb	ALIGNED(16);
	DoubleBufInit(&aabb_db, 0, 0, sizeof(aabb_t), NUM_AABBS, aabbbuffer[0], aabbbuffer[1]);

	// printf("Empty? %i\n", BufferEmpty(&arg.kdbuffer[b]));

	while(! BufferEmpty(&arg.kdbuffer[b]) )
	{
		kdbuffer_t *pkdb = (kdbuffer_t*)arg.kdbuffer[b].buffer;
		int size = BufferNumElements(&arg.kdbuffer[b]);
		int i;
		
		/* Reset the buffers that receive this pass's output. */
		BufferClear(&arg.aabb_buffer[1-b]);
		BufferClear(&arg.kdbuffer[1-b]);
	
		// printf("size %i\n", size);

		for(i=0; i < size; i++)
		{
			/* Take the next two node indices for the children. */
			l_kdb.node = arg.curnode++;
			r_kdb.node = arg.curnode++;		

			/* Copy the queue entry into the local kdb. */
			memcpy_ls(&kdb, &pkdb[i], sizeof(kdbuffer_t));

			node.split = kdb.plane;
			node.axis =  kdb.axis;
			node.left =  l_kdb.node;
			node.right = r_kdb.node;	


			/* Store the finished node record at its slot in arg.nodes. */
			memcpy_ea(&arg.nodes[ kdb.node ], &node, sizeof(kdnode_t));


			KDBufferAllocate(&l_kdb, kdb.left_size, &arg.aabb_buffer[1-b]);

			/* The right child goes to a job's buffer while jobs remain. */
			if(curjob < arg.njobs)
				KDBufferAllocate(&r_kdb, kdb.right_size, &arg.job_aabb_buffer[curjob]);
			else
				KDBufferAllocate(&r_kdb, kdb.right_size, &arg.aabb_buffer[1-b]);

			KDPartitionAll(&kdb, &l_kdb, &r_kdb);

			/* Left child: leaf if deep enough or small enough, else requeue. */
			if(l_kdb.depth == arg.maxdepth || l_kdb.size <= arg.maxleafsize)
			{
				total_leaf_size += l_kdb.count;

				l_kdb.aabb = (aabb_t*)BufferCopyTo(&arg.leaf_aabb_buffer, l_kdb.aabb, l_kdb.count);
				BufferCopyToLS(&arg.leafbuffer, &l_kdb, 1);
			}			
			else
			{	
				BufferCopyToLS(&arg.kdbuffer[1-b], &l_kdb, 1);
			}

			/* Right child: same test, but routed to the job buffers while jobs remain. */
			if(r_kdb.depth == arg.maxdepth || r_kdb.size <= arg.maxleafsize)
			{
				total_leaf_size += r_kdb.count;
		
				if(curjob < arg.njobs)
				{
					r_kdb.aabb = (aabb_t*)BufferCopyTo(&arg.job_leaf_aabb_buffer[curjob], r_kdb.aabb, r_kdb.count);
					BufferCopyToLS(&arg.job_leafbuffer[curjob], &r_kdb, 1);

					/* Write the job's leaf buffer header out and wait for it. */
					spu_mfcdma32(&arg.job_leafbuffer[curjob], (uint)arg.pjob_leafbuffer[curjob], sizeof(buffer_t), jobtag, MFC_PUT_CMD);					
					DmaWait(jobtag);
		
				}
				else
				{
					r_kdb.aabb = (aabb_t*)BufferCopyTo(&arg.leaf_aabb_buffer, r_kdb.aabb, r_kdb.count);
					BufferCopyToLS(&arg.leafbuffer, &r_kdb, 1);

				}

			}
			else
			{
				if(curjob < arg.njobs)
				{
					BufferCopyToLS(&arg.job_kdbuffer[curjob], &r_kdb, 1);

					/* Write the job's work-queue header out and wait for it. */
					spu_mfcdma32(&arg.job_kdbuffer[curjob], (uint)arg.pjob_kdbuffer[curjob], sizeof(buffer_t), jobtag, MFC_PUT_CMD);					
					DmaWait(jobtag);

				}
				else
					BufferCopyToLS(&arg.kdbuffer[1-b], &r_kdb, 1);
			}


			/*
			if(curjob < njobs)
				KDBufferAllocate(&r_kdb, kdb[i].right_size, &jobs[curjob]->aabb_buffer[0]);
			else
				KDBufferAllocate(&r_kdb, kdb[i].right_size, &aabb_buffer[1-b]);

			KDPartition(&kdb[i], &l_kdb, &r_kdb);

			if(l_kdb.depth == maxdepth || l_kdb.size <= maxleafsize)
			{
				l_kdb.aabb = (aabb_t*)BufferCopyTo(&leaf_aabb_buffer, l_kdb.aabb, l_kdb.count);
				BufferCopyTo(&leafbuffer, &l_kdb, 1);
			}			
			else
				BufferCopyTo(&kdbuffer[1-b], &l_kdb, 1);

			if(r_kdb.depth == maxdepth || r_kdb.size <= maxleafsize)
			{
				if(curjob < njobs)
				{
					r_kdb.aabb = (aabb_t*)BufferCopyTo(&jobs[curjob]->leaf_aabb_buffer, r_kdb.aabb, r_kdb.count);
					BufferCopyTo(&jobs[curjob]->leafbuffer, &r_kdb, 1);
				}
				else
				{
					r_kdb.aabb = (aabb_t*)BufferCopyTo(&leaf_aabb_buffer, r_kdb.aabb, r_kdb.count);
					BufferCopyTo(&leafbuffer, &r_kdb, 1);
				}
			}
			else
			{
				if(curjob < njobs)
					BufferCopyTo(&jobs[curjob]->kdbuffer[0], &r_kdb, 1);
				else
					BufferCopyTo(&kdbuffer[1-b], &r_kdb, 1);
			}
			*/



			if(curjob < arg.njobs)
			{
				// Start other job
				
				ppe_post_sema(arg.sema[curjob]);
				curjob++;
			}


	
		}
	
	
		/* Swap queues: what was filled in this pass is read in the next. */
		b =  1 - b;

	}

	/* Post the semaphore of every job that was not handed a subtree above. */
	while( curjob < arg.njobs)
	{
		ppe_post_sema(arg.sema[curjob]);
		curjob++;
	}



	// Transfer back
	spu_mfcdma32(&arg.curnode, (unsigned int)arg.pcurnode, (unsigned int)sizeof(int), put_tag[0], MFC_PUT_CMD);
	spu_mfcdma32(&total_leaf_size, (unsigned int)arg.ptotal_leaf_size, (unsigned int)sizeof(int), put_tag[1], MFC_PUT_CMD);
	

	DmaWait(put_tag[0]);
	DmaWait(put_tag[1]);


	/* Write the leaf buffer headers back. */
	spu_mfcdma32(&arg.leafbuffer, (unsigned int)arg.pleafbuffer, (unsigned int)sizeof(buffer_t), put_tag[0], MFC_PUT_CMD);
	spu_mfcdma32(&arg.leaf_aabb_buffer, (unsigned int)arg.pleaf_aabb_buffer, (unsigned int)sizeof(buffer_t), put_tag[1], MFC_PUT_CMD);

	DmaWaitAll();

	mfc_tag_release(put_tag[0]);
	mfc_tag_release(put_tag[1]);

}
/*
 * Advance one block of particles by one time step, four particles per
 * loop iteration.
 *
 * buffer - index of the pos/vel/inv_mass buffer to work on
 * cnt    - number of particles in the block; the loop steps by four and
 *          always touches four slots per iteration
 * dt_v   - time step replicated across all four vector lanes
 *
 * Per particle: pos += vel * dt, then vel += (dt * inv_mass) * force, with
 * force taken from ctx.force_v. One vector load of inv_mass supplies the
 * four scalar inverse masses, which are replicated with shuffles.
 */
void process_buffer(int buffer, int cnt, vector float dt_v)
{
  /* Shuffle patterns replicating word 0, 1, 2 or 3 into all four lanes. */
  const vector unsigned char rep_w0 = (vector unsigned char){0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  const vector unsigned char rep_w1 = (vector unsigned char){4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7};
  const vector unsigned char rep_w2 = (vector unsigned char){8, 9,10,11, 8, 9,10,11, 8, 9,10,11, 8, 9,10,11};
  const vector unsigned char rep_w3 = (vector unsigned char){12,13,14,15,12,13,14,15,12,13,14,15,12,13,14,15};

  volatile vector float *mass_cursor = (volatile vector float *)&inv_mass[buffer][0];
  const vector float force = ctx.force_v;
  int base;

  for (base = 0; base < cnt; base += 4) {
    vector float masses, scale;
    vector float p0, p1, p2, p3;
    vector float v0, v1, v2, v3;
    vector float s0, s1, s2, s3;

    /* Loads: four inverse masses, then four positions, then four velocities. */
    masses = *mass_cursor++;

    p0 = pos[buffer][base+0];
    p1 = pos[buffer][base+1];
    p2 = pos[buffer][base+2];
    p3 = pos[buffer][base+3];

    v0 = vel[buffer][base+0];
    v1 = vel[buffer][base+1];
    v2 = vel[buffer][base+2];
    v3 = vel[buffer][base+3];

    /* dt * inv_mass for all four particles, then one splat per particle. */
    scale = spu_mul(dt_v, masses);
    s0 = spu_shuffle(scale, scale, rep_w0);
    s1 = spu_shuffle(scale, scale, rep_w1);
    s2 = spu_shuffle(scale, scale, rep_w2);
    s3 = spu_shuffle(scale, scale, rep_w3);

    /* Position update uses the velocity from before this step. */
    p0 = spu_madd(v0, dt_v, p0);
    p1 = spu_madd(v1, dt_v, p1);
    p2 = spu_madd(v2, dt_v, p2);
    p3 = spu_madd(v3, dt_v, p3);

    v0 = spu_madd(s0, force, v0);
    v1 = spu_madd(s1, force, v1);
    v2 = spu_madd(s2, force, v2);
    v3 = spu_madd(s3, force, v3);

    /* Stores: positions first, then velocities. */
    pos[buffer][base+0] = p0;
    pos[buffer][base+1] = p1;
    pos[buffer][base+2] = p2;
    pos[buffer][base+3] = p3;

    vel[buffer][base+0] = v0;
    vel[buffer][base+1] = v1;
    vel[buffer][base+2] = v2;
    vel[buffer][base+3] = v3;
  }
}


/*
 * SPU entry point for the double-buffered particle simulation.
 *
 * Fetches the parm_context at argv, then for every time step walks the
 * particles in blocks of at most PARTICLES_PER_BLOCK. Two buffers and two
 * tags are used: while block k is being processed from buffer `buffer`, the
 * GETs for block k+1 into buffer `buffer^1` have already been issued on the
 * other tag. The last block of a time step is handled after the loop.
 *
 * NOTE(review): the two mfc_tag_reserve() results are not checked.
 */
int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long argv)
{
  int buffer, next_buffer;
  int cnt, next_cnt, left;
  float time, dt;
  vector float dt_v;
  volatile vector float *ctx_pos_v, *ctx_vel_v;
  volatile vector float *next_ctx_pos_v, *next_ctx_vel_v;
  volatile float *ctx_inv_mass, *next_ctx_inv_mass;
  unsigned int tags[2];

  // Reserve a pair of DMA tag IDs
  tags[0] = mfc_tag_reserve();
  tags[1] = mfc_tag_reserve();
  
  // Input parameter argv is a pointer to the particle context.
  // Fetch the parameter context, waiting for it to complete.
  spu_writech(MFC_WrTagMask, 1 << tags[0]);
  spu_mfcdma32((void *)(&ctx), (unsigned int)argv, sizeof(parm_context), tags[0], MFC_GET_CMD);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  dt = ctx.dt;
  dt_v = spu_splats(dt);

  // For each step in time
  for (time=0; time<END_OF_TIME; time += dt) {
    // For each double buffered block of particles
    left = ctx.particles;

    cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

    // Restart from the beginning of the particle arrays for this time step.
    ctx_pos_v = ctx.pos_v;
    ctx_vel_v = ctx.vel_v;
    ctx_inv_mass = ctx.inv_mass;

    // Prefetch first buffer of input data.
    buffer = 0;
    spu_mfcdma32((void *)(pos), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[0], MFC_GETB_CMD);
    spu_mfcdma32((void *)(vel), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[0], MFC_GET_CMD);
    spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx_inv_mass), cnt * sizeof(float), tags[0], MFC_GET_CMD);

    // Runs while at least one more block follows the current one.
    while (cnt < left) {
      left -= cnt;

      next_ctx_pos_v = ctx_pos_v + cnt;
      next_ctx_vel_v = ctx_vel_v + cnt;
      next_ctx_inv_mass = ctx_inv_mass + cnt;
      next_cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

      // Prefetch next buffer so the data is available for computation on next loop iteration.
      // The first DMA is barriered so that we don't GET data before the previous iteration's
      // data is PUT.
      next_buffer = buffer^1;

      spu_mfcdma32((void *)(&pos[next_buffer][0]), (unsigned int)(next_ctx_pos_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GETB_CMD);
      spu_mfcdma32((void *)(&vel[next_buffer][0]), (unsigned int)(next_ctx_vel_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GET_CMD);
      spu_mfcdma32((void *)(&inv_mass[next_buffer][0]), (unsigned int)(next_ctx_inv_mass), next_cnt * sizeof(float), tags[next_buffer], MFC_GET_CMD);
      
      // Wait for previously prefetched data
      spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

      process_buffer(buffer, cnt, dt_v);

      // Put the buffer's position and velocity data back into system memory
      spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
      spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
      
      // Advance to the block that was prefetched above.
      ctx_pos_v = next_ctx_pos_v;
      ctx_vel_v = next_ctx_vel_v;
      ctx_inv_mass = next_ctx_inv_mass;

      buffer = next_buffer;
      cnt = next_cnt;		  
    }

    // Last block of this time step: no further prefetch is issued.
    // Wait for previously prefetched data
    spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

    process_buffer(buffer, cnt, dt_v);

    // Put the buffer's position and velocity data back into system memory
    spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
    spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);

    // Wait for DMAs to complete before starting the next step in time.
    // NOTE(review): only the tag of the last buffer is waited on here.
    spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
  }

  return (0);
}
Esempio n. 14
0
/*
 * SPU entry point: compute this SPU's share of a dot product.
 *
 * spe_id       - unused
 * ppu_vector_a - main-memory address of the first input vector
 * ppu_vector_b - main-memory address of the second input vector
 *
 * The SPU number is read from the inbound mailbox and selects the
 * sub-vector starting at spu_num * SUBVEC_SZ_BYTES. The sub-vector is
 * streamed through NBUFFERS local buffers in CHUNK_SZ_BYTES pieces; each
 * buffer holds one chunk of each input vector and has one tag per vector.
 * The partial sum is sent back through the outbound mailbox as the bit
 * pattern of a float.
 *
 * Chunks are assigned to buffers round-robin: chunk k lands in buffer
 * k % NBUFFERS and is consumed in that same order. Each chunk is fetched
 * exactly once -- the initial fill advances the source offset per buffer,
 * and a buffer is refilled only while unfetched chunks remain, so nothing
 * is read past the end of the sub-vector.
 *
 * NOTE(review): the mfc_tag_reserve() results are not checked.
 */
int
main(
    unsigned long long spe_id,
    unsigned long long ppu_vector_a,
    unsigned long long ppu_vector_b)
{
    int i, iter, buf_idx, vec_idx;
    unsigned long long ppu_vector_bases[2] _ALIG(128);
    vector float * pchunk_a, * pchunk_b;
    vector float g_vec = {0,0,0,0};

    ppu_vector_bases[0] = ppu_vector_a;
    ppu_vector_bases[1] = ppu_vector_b;

    const unsigned int spu_num = spu_read_in_mbox();
    /* Byte offset of the next chunk to fetch, relative to both vector bases. */
    unsigned long long get_edge_bytes = spu_num * SUBVEC_SZ_BYTES;

    float buffers[NBUFFERS * BUF_SZ_FLOATS] _ALIG(128);
    int buffer_tags[NBUFFERS][2] _ALIG(128);

    for (iter = 0; iter < NBUFFERS; ++iter) {
        buffer_tags[iter][0] = mfc_tag_reserve();
        buffer_tags[iter][1] = mfc_tag_reserve();
    }

    /* Chunks of the sub-vector for which no DMA get has been issued yet. */
    int chunks_to_fetch = SUBVEC_SZ_CHUNKS;

    /* Initial fill: one distinct chunk per buffer, in buffer order. */
    for (buf_idx = 0; chunks_to_fetch > 0 && buf_idx < NBUFFERS; ++buf_idx) {
        for (vec_idx = 0; vec_idx < 2; ++vec_idx) {
            mfc_get(buf_ptr_float(buffers, buf_idx, vec_idx),
                    ppu_vector_bases[vec_idx] + get_edge_bytes,
                    CHUNK_SZ_BYTES,
                    buffer_tags[buf_idx][vec_idx],
                    0, 0);
        }
        get_edge_bytes += CHUNK_SZ_BYTES;
        --chunks_to_fetch;
    }

    /* Chunks that still have to be accumulated. */
    int chunksleft = SUBVEC_SZ_CHUNKS;
    while(chunksleft!=0) {
        for (buf_idx = 0; chunksleft !=0 && buf_idx < NBUFFERS; ++buf_idx) {
            const int tag_mask = (1 << buffer_tags[buf_idx][0])
                                 | (1 << buffer_tags[buf_idx][1]);

            /* Wait until both halves of this buffer have arrived. */
            mfc_write_tag_mask(tag_mask);
            mfc_read_tag_status_all();

            pchunk_a = buf_ptr_vecfloat(buffers, buf_idx, 0);
            pchunk_b = buf_ptr_vecfloat(buffers, buf_idx, 1);

            /* Lane-wise multiply-accumulate over the chunk. */
            for (i = 0; i < CHUNK_SZ_FLOATVECS; ++i) {
                g_vec = spu_madd(pchunk_a[i], pchunk_b[i], g_vec);
            }

            /* Refill the buffer just consumed, if any chunk is left to fetch. */
            if (chunks_to_fetch > 0) {
                for (vec_idx = 0; vec_idx < 2; ++vec_idx) {
                    mfc_get(buf_ptr_float(buffers, buf_idx, vec_idx),
                            ppu_vector_bases[vec_idx] + get_edge_bytes,
                            CHUNK_SZ_BYTES,
                            buffer_tags[buf_idx][vec_idx],
                            0, 0);
                }
                get_edge_bytes += CHUNK_SZ_BYTES;
                --chunks_to_fetch;
            }
            --chunksleft;
        }
    }

    for (iter = 0; iter < NBUFFERS; ++iter) {
        mfc_tag_release(buffer_tags[iter][0]);
        mfc_tag_release(buffer_tags[iter][1]);
    }

    /* Horizontal sum of the four lanes gives this SPU's partial result. */
    float_uint_t retval;
    retval.f =
        spu_extract(g_vec, 0) +
        spu_extract(g_vec, 1) +
        spu_extract(g_vec, 2) +
        spu_extract(g_vec, 3);

    /* The mailbox carries 32-bit integers: send the float's bit pattern. */
    spu_write_out_mbox(retval.i);

    return 0;
}
Esempio n. 15
0
void setup_spu(unsigned int spu_ctrlblock_addr){
  ctrl_dma_tag = mfc_tag_reserve();

  // Get SPU control block
  mfc_get(&spu_ctrlblock,
	  spu_ctrlblock_addr,
	  sizeof(spu_ctrlblock),
	  ctrl_dma_tag,
	  0,0);

  mfc_write_tag_mask(1<<ctrl_dma_tag);
  mfc_read_tag_status_all();

  mcb = (merger_ctrlblock_t*)memalign(128,spu_ctrlblock.num_mergers * sizeof(merger_ctrlblock_t) );
  md = (merger_data_t*)malloc(spu_ctrlblock.num_mergers * sizeof(merger_data_t));

  // Set addresses
  int i;
  for(i = 0; i < spu_ctrlblock.num_mergers; i++){
    // Set head/tail vector addresses
    mcb[i].idx_addr[LEFT] = (unsigned int) &md[i].idx[LEFT][HEAD];
    mcb[i].idx_addr[RIGHT] = (unsigned int) &md[i].idx[RIGHT][HEAD];
    mcb[i].idx_addr[OUT] = (unsigned int) &md[i].idx[PARENT][TAIL];
  }

  // Send merger control blocks
  mfc_put(mcb,
	  spu_ctrlblock.ctrlblocks_addr,
	  spu_ctrlblock.num_mergers * sizeof(merger_ctrlblock_t),
	  ctrl_dma_tag,
	  0,0);

  mfc_read_tag_status_all();

  // Mail PPU telling it we've set the addresses
  spu_write_out_mbox(1);

  // Wait for go-ahead mail
  spu_read_in_mbox();

  // Get merger blocks
  mfc_get(mcb,
	  spu_ctrlblock.ctrlblocks_addr,
	  spu_ctrlblock.num_mergers * sizeof(merger_ctrlblock_t),
	  ctrl_dma_tag,
	  0,0);

  mfc_read_tag_status_all();

  int buffer_idx = 0;
  for(i = 0; i < spu_ctrlblock.num_mergers; i++){
    // Add start address of buffer array to all block addresses
    if(mcb[i].id != 0)
      mcb[i].block_addr[OUT] += (unsigned int) &buffer[0];

    if(!mcb[i].leaf_node){
      mcb[i].block_addr[LEFT] += (unsigned int) &buffer[0];
      mcb[i].block_addr[RIGHT] += (unsigned int) &buffer[0];
    }

    // Setup merger data
    md[i].held_tag[LEFT] = 32;
    md[i].held_tag[RIGHT] = 32;
    md[i].held_tag[OUT] = 32;
    md[i].num_pulled[LEFT] = 0;
    md[i].num_pulled[RIGHT] = 0;
    md[i].mm_depleted[LEFT] = 0;
    md[i].mm_depleted[RIGHT] = 0;
    md[i].depleted[LEFT] = 0;
    md[i].depleted[RIGHT] = 0;
    md[i].done = 0;
    md[i].consumed[LEFT] = 0;
    md[i].consumed[RIGHT] = 0;

    md[i].idx[LEFT][HEAD] = spu_splats(0);
    md[i].idx[LEFT][TAIL] = spu_splats(0);
    md[i].idx[RIGHT][HEAD] = spu_splats(0);
    md[i].idx[RIGHT][TAIL] = spu_splats(0);
    md[i].idx[OUT][HEAD] = spu_splats(0);
    md[i].idx[OUT][TAIL] = spu_splats(0);
    md[i].idx[PARENT][HEAD] = spu_splats(0);
    md[i].idx[PARENT][TAIL] = spu_splats(0);

    md[i].buffer[LEFT] = &buffer[buffer_idx];
    buffer_idx += mcb[i].buffer_size[LEFT];
    md[i].buffer[RIGHT] = &buffer[buffer_idx];
    buffer_idx += mcb[i].buffer_size[RIGHT];
    md[i].buffer[OUT] = &buffer[buffer_idx];
    buffer_idx += mcb[i].buffer_size[OUT];
  }

  // Setup internal nodes
  for(i = 0; i < spu_ctrlblock.num_mergers; i++){
    if(mcb[i].local[OUT] < 255){
      int parent_idx = mcb[i].local[OUT];
      int side = (mcb[i].id+1)&1;
      md[i].buffer[OUT] = md[parent_idx].buffer[side];
      mcb[i].buffer_size[OUT] = mcb[parent_idx].buffer_size[side];
    }
  }
}
Esempio n. 16
0
/*
 * Send vectors from the OUT buffer of the active merger (global index `am`)
 * to mcb[am].block_addr[OUT] using DMA puts.
 *
 * The number of vectors sent is the smallest of: what num_in_buffer(OUT)
 * reports, the available room on the parent side, and MAX_DMA_SIZE. Nothing
 * is done when that number is zero or when no DMA tag can be reserved.
 *
 * The reserved tag is stored in md[am].held_tag[OUT]; this function neither
 * waits for the transfers to complete nor releases the tag.
 */
void push(){
  int ready = num_in_buffer(OUT);
  if(ready == 0)
    return;

  // For merger id 0 the free-slot count is replaced by the sum of the two
  // input data sizes (num_free_in_buffer is still called first).
  int room = num_free_in_buffer(PARENT);
  if(mcb[am].id == 0)
    room = mcb[am].data_size[LEFT] + mcb[am].data_size[RIGHT];

  // remaining = min(ready, room), capped at MAX_DMA_SIZE.
  int remaining = room;
  if(ready < room)
    remaining = ready;
  if(!(remaining < MAX_DMA_SIZE))
    remaining = MAX_DMA_SIZE;
  if(remaining == 0)
    return;

  int tag = mfc_tag_reserve();
  if(tag == MFC_TAG_INVALID)
    return;
  md[am].held_tag[OUT] = tag;

  // Each pass issues one put that stops at whichever comes first: the end
  // of the parent buffer, the end of the OUT buffer, or the remaining count.
  int chunk;
  for(; remaining > 0; remaining -= chunk){
    int head = spu_extract(md[am].idx[PARENT][HEAD],0);
    int to_parent_end = mcb[am].buffer_size[PARENT] - head;

    int src = spu_extract(md[am].idx[OUT][TAIL],0);
    int to_out_end = mcb[am].buffer_size[OUT] - src;

    chunk = remaining;
    if(to_parent_end < chunk)
      chunk = to_parent_end;
    if(to_out_end < chunk)
      chunk = to_out_end;

    // Destination: block address plus the parent head offset in bytes.
    unsigned int dest = mcb[am].block_addr[OUT] + head*sizeof(vector signed int);

    mfc_put(&md[am].buffer[OUT][src],
            dest,
            chunk * sizeof(vector signed int),
            md[am].held_tag[OUT],
            0,0);

    // Advance the parent head; reset it to zero on reaching the buffer size.
    md[am].idx[PARENT][HEAD] = spu_add(md[am].idx[PARENT][HEAD], chunk);
    head = spu_extract(md[am].idx[PARENT][HEAD],0);
    if(head == mcb[am].buffer_size[PARENT])
      md[am].idx[PARENT][HEAD] = spu_splats(0);

    // Advance the OUT tail the same way.
    md[am].idx[OUT][TAIL] = spu_add(md[am].idx[OUT][TAIL], chunk);
    src = spu_extract(md[am].idx[OUT][TAIL],0);
    if(src == mcb[am].buffer_size[OUT])
      md[am].idx[OUT][TAIL] = spu_splats(0);
  }

  // Mergers with a non-zero id also send the updated parent head index to
  // idx_addr[OUT]; the fenced put uses the same tag as the data puts above.
  if(mcb[am].id != 0){
    mfc_putf(&md[am].idx[PARENT][HEAD],
             mcb[am].idx_addr[OUT],
             sizeof(vector signed int),
             md[am].held_tag[OUT],
             0,0);
  }
}