Exemple #1
0
void check_pull_dma(int side){
  // Check left
  if(md[am].held_tag[side] < 32){
    mfc_write_tag_mask( 1 << md[am].held_tag[side] );
    int status = mfc_read_tag_status_immediate();

    if(status){
      // Update idx
      md[am].idx[side][HEAD] = spu_add(md[am].idx[side][HEAD], md[am].num_waiting[side]);

      vector signed int buffer_size = spu_splats(mcb[am].buffer_size[side] -1);
      vector unsigned int cmp_v = spu_cmpgt(md[am].idx[side][HEAD], buffer_size);
      vector signed int zeros = {0,0,0,0};
      buffer_size = spu_add(buffer_size,1);
      zeros = spu_sel(zeros,buffer_size,cmp_v);
      md[am].idx[side][HEAD] = spu_sub(md[am].idx[side][HEAD],zeros);

      md[am].num_pulled[side] += md[am].num_waiting[side];      
      md[am].num_waiting[side] = 0;
      if(md[am].num_pulled[side] == mcb[am].data_size[side]){
	md[am].mm_depleted[side] = 1;
      }
      // Release tag
      mfc_tag_release( md[am].held_tag[side] );
      md[am].held_tag[side] = 32;      
    }
  }
}
Exemple #2
0
/*
 * Per-lane cost:
 *   ctravers + (cleft * (aleft * invarea) + cright * (aright * invarea))
 * evaluated in exactly that association order.
 */
inline vector float SAHCostSIMD(vector float invarea, vector float ctravers, vector float cleft, vector float aleft, vector float cright, vector float aright)
{
	vector float left_weight = spu_mul(aleft, invarea);
	vector float left_cost = spu_mul(cleft, left_weight);

	vector float right_weight = spu_mul(aright, invarea);
	vector float right_cost = spu_mul(cright, right_weight);

	vector float children_cost = spu_add(left_cost, right_cost);

	return spu_add(ctravers, children_cost);
}
Exemple #3
0
/*
 * Merges chunks of a cache line's chunk list, one merge per loop iteration,
 * and returns as soon as the `canmerge` mask of an iteration is empty.
 *
 * Every iteration works on the 16-byte `next` vector (one byte per chunk):
 * it picks one chunk (mergeid), makes its entry take the value of its
 * successor's entry, overwrites the successor's entry with
 * CHUNKNEXT_FREE_BLOCK, writes -1 into the successor's chunkStartArray slot
 * and stores the updated vector back to cache->chunkNext.
 *
 * NOTE(review): SHUF0, SHUF1, MERGE and the CHUNKNEXT_* constants are defined
 * outside this block, so the exact merge criterion cannot be confirmed from
 * here.
 */
inline void merge_cache_blocks(RenderableCacheLine* cache)
{
    vec_uchar16 next = cache->chunkNext;

    for (;;) {
        // next is used as its own shuffle pattern, so each byte fetches the
        // entry selected by its own value (presumably next[next[i]] -- confirm)
        vec_uchar16 nextnext = spu_shuffle(next, next, next);
        vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK));

        // per-halfword masks: all ones where chunkStart is 0
        vec_ushort8 firstblock0 = spu_cmpeq( cache->chunkStart[0], 0);
        vec_ushort8 firstblock1 = spu_cmpeq( cache->chunkStart[1], 0);
        // change next to word offset, note we don't care what the low bit shifted in is
        vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 );
        // for every chunk: the "chunkStart == 0" flag of the chunk it points at
        vec_uchar16 first = (vec_uchar16) spu_shuffle( firstblock0, firstblock1, firstshuf );

        vec_ushort8 tri0 = cache->chunkTriangle[0];
        vec_ushort8 tri1 = cache->chunkTriangle[1];
        // byte offsets of the high (odd) and low (even) byte of the halfword
        // addressed by firstshuf
        vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1));
        vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254));

        // chunkTriangle of the chunk each entry points at, gathered as halfwords
        vec_ushort8 ntri0 = spu_shuffle( tri0, tri1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) );
        vec_ushort8 ntri1 = spu_shuffle( tri0, tri1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) );

        // all ones where a chunk and the chunk it points at have equal chunkTriangle
        vec_ushort8 trieq0 = spu_cmpeq( tri0, ntri0 );
        vec_ushort8 trieq1 = spu_cmpeq( tri1, ntri1 );

        vec_uchar16 trieq = (vec_uchar16) spu_shuffle( trieq0, trieq1, MERGE );
        // combi = first | ~trieq
        vec_uchar16 combi = spu_orc(first, trieq);

        // canmerge byte = 0xFF where ~(next | nextnext | combi) > 256-CHUNKNEXT_BUSY_BIT
        vec_uchar16 canmerge = spu_cmpgt( spu_nor(spu_or(next, nextnext), combi), 256-CHUNKNEXT_BUSY_BIT );

        // one bit per byte of canmerge, packed into the low 16 bits of word 0
        vec_uint4 gather = spu_gather( canmerge );

        // leading-zero count minus 16 = byte index of the first candidate
        vec_uint4 mergeid = spu_sub( spu_cntlz( gather ), spu_promote((unsigned int)16, 0));

        // nothing left to merge
        if( !spu_extract(gather, 0) ) {
            return;
        }

        //	unsigned int firstchunk = spu_extract(mergeid, 0);
        //	unsigned int nextchunk = cache->chunkNextArray[firstchunk];
        // rotating by (index + 13) bytes moves byte `index` into byte 3, the
        // low byte of word 0
        vec_uint4 v_chunkNext = (vec_uint4) si_rotqby( (qword) next, (qword) spu_add(mergeid,13) );
        vec_uint4 v_chunkNextNext = (vec_uint4) si_rotqby( (qword) next, (qword) spu_add(v_chunkNext,13) );

        // cache->chunkNextArray[firstchunk] = cache->chunkNextArray[nextchunk];
        next = spu_shuffle( (vec_uchar16) v_chunkNextNext, next, (vec_uchar16) si_cbd( (qword) mergeid, 0 ) );

        // cache->chunkNextArray[nextchunk] = CHUNKNEXT_FREE_BLOCK;
        next = spu_shuffle( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK), next, (vec_uchar16) si_cbd( (qword) v_chunkNext, 0 ) );

        // this is for debug use only, it's not really needed...
        // cache->chunkStartArray[nextchunk] = -1;
        // only the low byte of word 0 holds the rotated entry, hence the & 255
        cache->chunkStartArray[ spu_extract(v_chunkNext,0) & 255 ] = -1;

        // publish the updated list before looking for the next candidate
        cache->chunkNext = next;
    }
}
Exemple #4
0
/*
 * Computes args.c[k] = args.a[k] + args.b[k], streaming the operands through
 * the local-store buffers ls1/ls2 and the result through ls3.
 *
 * Full blocks of SIZE floats are processed while more than SIZE elements
 * remain.  What is left (at most SIZE elements) is handled by one final
 * transfer whose byte count is rounded DOWN to a multiple of 128, so up to
 * 31 trailing floats are not processed by this function.
 *
 * All DMAs use the single tag group TAG; waiting on it at the top of an
 * iteration therefore also waits for the previous iteration's put.
 */
void add()
{
	int i, j, n;

	n = SIZE * sizeof(float);

	for (i = 0; (i + SIZE) < args.N; i += SIZE) {
		mfc_get((void *)&ls1[0], (unsigned int )&args.a[i], n, TAG, 0, 0);
		mfc_get((void *)&ls2[0], (unsigned int )&args.b[i], n, TAG, 0, 0);
		mfc_write_tag_mask(1 << TAG);
		mfc_read_tag_status_all();

		for (j = 0; j < (SIZE / 4); ++j)
			ls3[j] = spu_add(ls1[j], ls2[j]);

		mfc_put((void *)&ls3[0], (unsigned int )&args.c[i], n, TAG, 0, 0);
	}

	/* Wait for the last put of the main loop. */
	mfc_write_tag_mask(1 << TAG);
	mfc_read_tag_status_all();

	if (unlikely(i < args.N)) {
		/*
		 * args.N - i is at most SIZE at this point, so the tail fits in
		 * the local buffers.  The byte count is rounded down to a
		 * multiple of 128, which is also a multiple of 16.
		 * NOTE(review): the original comment asked only for a multiple
		 * of 16 while the mask is ~127 -- confirm which was intended.
		 */
		n = ((args.N - i) * sizeof(float)) & (~127);

		/* Nothing to transfer when fewer than 128 bytes remain. */
		if (n > 0) {
			mfc_get((void *)&ls1[0], (unsigned int )&args.a[i], n, TAG, 0, 0);
			mfc_get((void *)&ls2[0], (unsigned int )&args.b[i], n, TAG, 0, 0);
			mfc_write_tag_mask(1 << TAG);
			mfc_read_tag_status_all();

			/*
			 * Add exactly the vectors that were fetched and will be
			 * written back; anything beyond n bytes in ls1/ls2 is
			 * stale data from an earlier block.
			 */
			for (j = 0; j < n / (int)sizeof(ls3[0]); ++j)
				ls3[j] = spu_add(ls1[j], ls2[j]);

			mfc_put((void *)&ls3[0], (unsigned int )&args.c[i], n, TAG, 0, 0);
			mfc_write_tag_mask(1 << TAG);
			mfc_read_tag_status_all();
		}
	}

	/*
	 * At this point elements may still be unprocessed: the tail above
	 * drops whatever does not fill a whole 128-byte unit.
	 */
}
Exemple #5
0
void process_data_simd (float* buf_in, float* buf_out, unsigned int size)
{
  unsigned int i;
  vector float *vbuf_in, *vbuf_out;
  vector float v1 = (vector float) {1.0f, 1.0f, 1.0f, 1.0f};
  vbuf_in = (vector float*) buf_in;
  vbuf_out = (vector float*) buf_out;

  for (i = 0; i < (size / 4); i++)
  {
   vbuf_out[i] = spu_add (vbuf_in[i], v1); 
  }
}

/*
 * Fills a dma list so that it describes num_elements consecutive transfers
 * of elem_size bytes each: element i gets notify = 0, size = elem_size and
 * eal = base_addr + i * elem_size.
 *
 * @param list:         the dma list to be filled
 * @param num_elements: number of elements in list
 * @param base_addr:    base effective address
 * @param elem_size:    size of each dma element
 *
 */
void fill_dma_list (mfc_list_element_t* list, int num_elements, unsigned long long base_addr, unsigned int elem_size)
{
  mfc_list_element_t *entry = list;
  unsigned int offset = 0;   /* i * elem_size, kept in unsigned int arithmetic */
  int remaining;

  for (remaining = num_elements; remaining > 0; remaining--)
  {
    entry->notify = 0;
    entry->size = elem_size;
    entry->eal = base_addr + offset;

    offset += elem_size;
    entry++;
  }
}
/*
 * Reserves number_of_tags consecutive tags in __mfc_tag_table.
 *
 * The table is scanned from its most significant bit: a 0 bit is counted as
 * a busy tag and a 1 bit as a free tag.  On success the bits of the run that
 * was found are cleared in __mfc_tag_table and the index of the first tag of
 * the run is returned.  If the scan runs out of free bits first, the table
 * is left unchanged and 0xFFFFFFFF is returned.
 *
 * Only element 0 of the vectors is inspected through spu_extract; the same
 * operations are applied to all four elements.
 */
unsigned int
__mfc_multi_tag_reserve (unsigned int number_of_tags)
{
  vector unsigned int table_copy;
  vector unsigned int one = (vector unsigned int)
        { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
  vector unsigned int count_busy, is_valid;
  vector unsigned int count_total;
  vector unsigned int count_avail = (vector unsigned int) { 0, 0, 0, 0 };
  vector unsigned int index = (vector unsigned int) { 0, 0, 0, 0 };

  table_copy = __mfc_tag_table;


  /* count_busy: number of consecutive busy tags
     count_avail: number of consecutive free tags
     table_copy: temporary copy of the tag table
     count_total: sum of count_busy and count_avail
     index: index of the current working tag  */
  do
    {
      /* Drop the free run examined by the previous pass (too short).  */
      table_copy = spu_sl (table_copy, count_avail);

      /* Skip the leading busy (0) bits ...  */
      count_busy = spu_cntlz (table_copy);
      table_copy = spu_sl (table_copy, count_busy);
      /* ... and measure the run of free (1) bits that follows.  */
      count_avail = spu_cntlz (spu_xor(table_copy, -1));
      count_total = spu_add (count_busy, count_avail);
      index = spu_add (index, count_total);
    }
  /* Stop when the run is long enough or when no free bit is left.  */
  while (spu_extract (count_avail, 0) < number_of_tags
	 && spu_extract (table_copy, 0) != 0);

  /* index has moved past the run just measured; step back to its start.  */
  index = spu_sub (index, count_avail);

  /* is_valid is set to 0xFFFFFFFF if table_copy == 0, 0 otherwise.
     Despite its name it therefore flags the failure case.  */
  is_valid = spu_cmpeq (table_copy, 0);
  /* On failure the returned index becomes 0xFFFFFFFF.  */
  index = spu_sel (index, is_valid, is_valid);

  /* Now I need to actually mark the tags as used.  */
  /* Build a mask with number_of_tags zero bits and rotate the zeros to the
     position of the run that was found.  */
  table_copy = spu_sl (one, number_of_tags);
  table_copy = spu_rl (table_copy, -number_of_tags - spu_extract (index, 0));
  /* Clear those bits in the table, keeping every other bit as it was.  */
  table_copy = spu_sel (table_copy, __mfc_tag_table, table_copy);
  /* Commit only on success; on failure the old table is written back.  */
  __mfc_tag_table = spu_sel (table_copy, __mfc_tag_table, is_valid);

  return spu_extract (index, 0);
}
Exemple #7
0
void cp_buffer(int side){
  int avail_out = num_free_in_buffer(OUT);
  int avail_side = num_in_buffer(side);
  int max = avail_out < avail_side ? avail_out : avail_side;

  vector signed int *out_head;
  if(mcb[am].local[OUT] < 255)
    out_head = (vector signed int*) &md[ mcb[am].local[OUT] ].idx[ (mcb[am].id+1)&1 ][HEAD];
  else
    out_head = (vector signed int*) &md[am].idx[OUT][HEAD];

  vector unsigned int cmp_v;
  vector signed int from_size = spu_splats( mcb[am].buffer_size[side] );
  vector signed int out_size = spu_splats( mcb[ mcb[am].local[OUT] ].buffer_size[ (mcb[am].id+1)&1 ] );
  vector signed int ones = {1,1,1,1};
  vector signed int zeros = {0,0,0,0};

  int i;
  for(i = 0; i < max; i++){
    md[am].buffer[OUT][spu_extract( *out_head,0)] = md[am].buffer[side][spu_extract(md[am].idx[side][TAIL],0)];
    // update idx
    md[am].idx[side][TAIL] = spu_add(md[am].idx[side][TAIL], ones);
    cmp_v = spu_cmpeq(md[am].idx[side][TAIL],from_size);
    md[am].idx[side][TAIL] = spu_sel(md[am].idx[side][TAIL], zeros, cmp_v);

    *out_head = spu_add(*out_head,ones);
    cmp_v = spu_cmpeq(*out_head, out_size);
    *out_head = spu_sel(*out_head,zeros,cmp_v);
  }

  update_tail(side);

  md[am].consumed[side] += max;

  if(mcb[am].local[OUT] < 255 && md[am].consumed[side] == mcb[am].data_size[side]){
    md[am].depleted[side] = 1;
    md[am].done = 1;
    --num_active_mergers;
  }
}
Exemple #8
0
/*
 * Sends buffered OUT vectors to the parent with DMA puts.
 *
 * The amount sent is the minimum of the OUT fill level, the free space in
 * the parent buffer (for the merger with id 0: data_size[LEFT] +
 * data_size[RIGHT] instead) and MAX_DMA_SIZE.  Returns early when there is
 * nothing to send or when no MFC tag can be reserved.  The reserved tag is
 * stored in held_tag[OUT]; it is not released here.
 *
 * Each put is cut at the end of the parent buffer and at the end of the OUT
 * buffer, so one call may issue several puts.  Mergers with a non-zero id
 * finish with a fenced put of their PARENT head index to idx_addr[OUT].
 */
void push(){
  int avail_out = num_in_buffer(OUT);
  if(avail_out == 0)
    return;

  int avail_parent = num_free_in_buffer(PARENT);
  if(mcb[am].id == 0)
    avail_parent = mcb[am].data_size[LEFT] + mcb[am].data_size[RIGHT];

  // num_send = min(avail_out, avail_parent, MAX_DMA_SIZE)
  int num_send = avail_parent;
  if(avail_out < avail_parent)
    num_send = avail_out;
  if(!(num_send < MAX_DMA_SIZE))
    num_send = MAX_DMA_SIZE;
  if(num_send == 0)
    return;

  int tag = mfc_tag_reserve();
  if(tag == MFC_TAG_INVALID)
    return;
  md[am].held_tag[OUT] = tag;

  // send num_send vectors, in up to three DMA-put's
  while(num_send > 0){
    int parent_head = spu_extract(md[am].idx[PARENT][HEAD],0);
    int tail = spu_extract(md[am].idx[OUT][TAIL],0);

    // room up to the end of each circular buffer
    int free_from_head = mcb[am].buffer_size[PARENT] - parent_head;
    int avail_from_tail = mcb[am].buffer_size[OUT] - tail;

    // part_send = min(num_send, free_from_head, avail_from_tail)
    int part_send = num_send;
    if(!(num_send < free_from_head))
      part_send = free_from_head;
    if(!(part_send < avail_from_tail))
      part_send = avail_from_tail;

    unsigned int to = mcb[am].block_addr[OUT] + parent_head*sizeof(vector signed int);

    mfc_put(&md[am].buffer[OUT][tail],
	    to,
	    part_send * sizeof(vector signed int),
	    md[am].held_tag[OUT],
	    0,0);

    // advance the parent head, wrapping at the end of the parent buffer
    md[am].idx[PARENT][HEAD] = spu_add(md[am].idx[PARENT][HEAD], part_send);
    int new_head = spu_extract(md[am].idx[PARENT][HEAD],0);
    if(new_head == mcb[am].buffer_size[PARENT])
      md[am].idx[PARENT][HEAD] = spu_splats(0);

    // advance the OUT tail, wrapping at the end of the OUT buffer
    md[am].idx[OUT][TAIL] = spu_add(md[am].idx[OUT][TAIL], part_send);
    int new_tail = spu_extract(md[am].idx[OUT][TAIL],0);
    if(new_tail == mcb[am].buffer_size[OUT])
      md[am].idx[OUT][TAIL] = spu_splats(0);

    num_send -= part_send;
  }

  // Inner nodes updates parent in buffer head idx
  if(mcb[am].id)
    mfc_putf(&md[am].idx[PARENT][HEAD],
	     mcb[am].idx_addr[OUT],
	     sizeof(vector signed int),
	     md[am].held_tag[OUT],
	     0,0);
}
Exemple #9
0
/*
 * Searches the bins of `mmb` for the cheapest split plane on the first
 * three vector lanes (used as axes 0..2) and stores the winner in `result`.
 *
 * Step 1 turns minbins into running sums from the front and maxbins into
 * running sums from the back, in place.
 * Step 2 evaluates SAHCostSIMD at numbins-1 equally spaced positions between
 * result->baabb.min and result->baabb.max, keeping per lane the lowest cost,
 * its bin index and its position (starting from mmb->bestcost).
 * Step 3 picks the axis with the lowest cost and writes plane, axis,
 * left_size and right_size to `result` and the cost back to mmb->bestcost.
 *
 * NOTE(review): step 1 reads the vector directly before minbins[i].b and
 * directly after maxbins[j].b, so it presumably relies on each bin being
 * exactly one vector wide -- confirm against the minmaxbin_t layout.
 */
void MinMaxBinFindBest3SIMD(minmaxbin_t *mmb, kdbuffer_t *result)
{
	int i;

	/* Step 1: prefix sums over minbins, suffix sums over maxbins. */
	for(i=1; i < mmb->numbins; i++)
	{
		vector float *lo = (vector float *)mmb->minbins[i].b;
		vector float *hi = (vector float *)mmb->maxbins[mmb->numbins - i - 1].b;

		lo[0] = spu_add(lo[0], lo[-1]);
		hi[0] = spu_add(hi[0], hi[1]);
	}

	vector float box_max = *(vector float*)result->baabb.max;
	vector float box_min = *(vector float*)result->baabb.min;

	/* Box extent and the distance between two candidate planes. */
	vector float vwidth = spu_abs( spu_sub(box_max, box_min) );
	vector float vdelta = spu_mul(vwidth, spu_splats(1/(float)mmb->numbins));
	vector float vx = spu_add(box_min, vdelta);

	vector float vside = { vwidth[1] * vwidth[2], vwidth[0] * vwidth[2], vwidth[0] * vwidth[1], 0 };
	vector float invarea = spu_splats( 1/(vwidth[0] * vside[0]));
	vector float vctravers = spu_splats(2.0f);

	/* Running best per lane. */
	vector float vbestcost = spu_splats(mmb->bestcost);
	vector int vbesti = spu_splats(0);
	vector float vbestx = vx;

	/* Step 2: cost of every candidate plane, all lanes at once. */
	for(i=0; i < mmb->numbins-1; i++)
	{
		vector float aleft, aright;

		AreaLeftRight(box_min, box_max, vside, vx, &aleft, &aright);

		vector float count_left = *(vector float *)mmb->minbins[i].b;
		vector float count_right = *(vector float *)mmb->maxbins[i+1].b;

		vector float cost = SAHCostSIMD(invarea, vctravers, count_left, aleft, count_right, aright);

		/* keep the old best where the new cost is strictly greater */
		vector unsigned int worse = spu_cmpgt(cost, vbestcost);
		vbestcost = spu_sel(cost, vbestcost, worse);
		vbesti = spu_sel(spu_splats(i), vbesti, worse);
		vbestx = spu_sel(vx, vbestx, worse);

		vx = spu_add(vx, vdelta);
	}

	/* Step 3: first axis with the strictly lowest cost among lanes 0..2. */
	int axis = 0;
	int k;
	for(k = 1; k <= 2; k++)
	{
		if(vbestcost[k] < vbestcost[axis])
			axis = k;
	}

	int index = vbesti[axis];

	result->plane = vbestx[axis];
	result->axis = axis;
	result->left_size = (int)mmb->minbins[ index ].b[axis];
	result->right_size = (int)mmb->maxbins[ index+1 ].b[axis];

	mmb->bestcost = vbestcost[axis];
}
Exemple #10
0
void process_data_simd (float* buf_in, float* buf_out, unsigned int size)
{
  unsigned int i;
  vector float *vbuf_in, *vbuf_out;
  vector float v1 = (vector float) {1.0f, 1.0f, 1.0f, 1.0f};
  vbuf_in = (vector float*) buf_in;
  vbuf_out = (vector float*) buf_out;

  for (i = 0; i < (size / 4); i++)
  {
   vbuf_out[i] = spu_add (vbuf_in[i], v1); 
  }
}

/*
 * SPU entry point.
 *
 * Fetches the control block found at effective address `argp`, then
 * processes control_block.num_elements_per_spe / CHUNK_SIZE chunks: each
 * chunk is DMA'd into local_buffer_in, run through process_data_simd and
 * DMA'd back from local_buffer_out.  Elements beyond the last whole chunk
 * are not processed.
 *
 * Returns 0 on success and 1 when no MFC tag could be reserved.  The tag is
 * released again before a successful return.
 */
int main(unsigned long long speid __attribute__ ((unused)),
         unsigned long long argp, 
         unsigned long long envp __attribute__ ((unused)))
{
  unsigned int tag;
  unsigned int tag_mask;
  unsigned long long in_addr, out_addr;
  int i, num_chunks;

#ifdef USE_TIMER
  uint64_t start, time_working;
  spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih);
  spu_clock_start();
  start = spu_clock_read();
#endif /* USE_TIMER */

  /* First, we reserve a MFC tag for use */
  tag = mfc_tag_reserve();
  if (tag == MFC_TAG_INVALID)
  {
    printf ("SPU ERROR, unable to reserve tag\n");
    return 1;
  }

  /* unsigned shift: a signed `1 << 31` would be undefined for tag 31 */
  tag_mask = 1u << tag;

  /* issue DMA transfer to get the control block information from 
   * system memory */
  mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0);

  /* wait for the DMA to complete */ 
  mfc_write_tag_mask (tag_mask);
  mfc_read_tag_status_all ();

  /* calculate the number of blocks (chunks) that this spe is assigned 
   * to process */
  num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE;

  /*
   * This is the main loop.  We go through the num_chunks chunks one at
   * a time: fetch a chunk of data, process it, and write it back to
   * system memory until done. 
   */
  for (i = 0; i < num_chunks; i++)
  {
    /* set the in_addr and out_addr variables, we will use these for
     * issuing DMA get and put commands */
    in_addr = control_block.in_addr + (i* CHUNK_SIZE * sizeof(float));
    out_addr = control_block.out_addr + (i * CHUNK_SIZE * sizeof(float));

    /* issue a DMA get command to fetch the chunk of data from system memory */
    mfc_get (local_buffer_in, in_addr, CHUNK_SIZE * sizeof(float), 
        tag, 0, 0);

    /* wait for the DMA get to complete */ 
    mfc_write_tag_mask (tag_mask);
    mfc_read_tag_status_all ();


    /* invoke process_data to work on the data that's just been moved into 
     * local store*/
    process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE);

   /* issue the DMA put command to transfer result from local memory to 
    * system memory */
    mfc_put (local_buffer_out, out_addr, CHUNK_SIZE * sizeof(float), 
        tag, 0, 0);

    /* wait for the DMA put to complete */
    mfc_write_tag_mask (tag_mask);
    mfc_read_tag_status_all ();
  }

  /* every transfer has completed, so the tag can be handed back */
  mfc_tag_release (tag);

#ifdef USE_TIMER
  time_working = (spu_clock_read() - start);
  spu_clock_stop();
  /* time_working is unsigned 64-bit: print it with a matching specifier */
  printf ("SPE time_working = %llu\n", (unsigned long long) time_working);
#endif /* USE_TIMER */

  return 0;
}
Exemple #11
0
/*
 * Fills dma_list_buffer[0..15] with 32 {size, address} pairs, two per
 * vector.  Every size is 128; pair k (k = 0..31) gets the address
 * eal + k * stride, computed in 32-bit unsigned arithmetic.
 *
 * The vectors are built from a handful of precomputed steps instead of a
 * loop: block<k> holds pairs k and k+1.
 *
 * With DEBUG_1 defined the arguments and the finished list are printed.
 */
static inline void build_blit_list(
		vec_uint4* dma_list_buffer,
		unsigned long eal, unsigned long stride)
{
#ifdef DEBUG_1
	/* stride is unsigned long, so it needs %lu rather than %d */
	printf("build_blit_list: eal=%lx stride=%lu\n", eal, stride);
#endif

	unsigned long eal1 = eal + stride;
	unsigned long stride2 = 2 * stride;
	unsigned long stride8 = 8 * stride;

	/* pairs 0 and 1; only the address slots (1 and 3) ever change */
	vec_uint4 block0 = { 128, eal, 128, eal1 };

	/* address increments of 2, 4, 6, 8 and 16 rows */
	vec_uint4 step2 = { 0, stride2, 0, stride2};
	vec_uint4 step4 = spu_add(step2, step2);
	vec_uint4 step6 = spu_add(step4, step2);
	vec_uint4 step8 = { 0, stride8, 0, stride8};
	vec_uint4 step16 = spu_add(step8, step8);

	/* rows 2..14 */
	vec_uint4 block2 = spu_add(block0, step2);
	vec_uint4 block4 = spu_add(block0, step4);
	vec_uint4 block6 = spu_add(block0, step6);
	vec_uint4 block8 = spu_add(block0, step8);
	vec_uint4 block10 = spu_add(block8, step2);
	vec_uint4 block12 = spu_add(block8, step4);
	vec_uint4 block14 = spu_add(block8, step6);

	/* rows 16..30: the first half shifted by 16 rows */
	vec_uint4 block16 = spu_add(block8, step8);
	vec_uint4 block18 = spu_add(step16, block2);
	vec_uint4 block20 = spu_add(step16, block4);
	vec_uint4 block22 = spu_add(step16, block6);
	vec_uint4 block24 = spu_add(step16, block8);
	vec_uint4 block26 = spu_add(step16, block10);
	vec_uint4 block28 = spu_add(step16, block12);
	vec_uint4 block30 = spu_add(step16, block14);

	dma_list_buffer[0] = block0;
	dma_list_buffer[1] = block2;
	dma_list_buffer[2] = block4;
	dma_list_buffer[3] = block6;
	dma_list_buffer[4] = block8;
	dma_list_buffer[5] = block10;
	dma_list_buffer[6] = block12;
	dma_list_buffer[7] = block14;

	dma_list_buffer[8] = block16;
	dma_list_buffer[9] = block18;
	dma_list_buffer[10] = block20;
	dma_list_buffer[11] = block22;
	dma_list_buffer[12] = block24;
	dma_list_buffer[13] = block26;
	dma_list_buffer[14] = block28;
	dma_list_buffer[15] = block30;

#ifdef DEBUG_1
	int i,j;
	for (i=0;i<16;i++) {
		/* casts make every argument match its conversion specifier */
		printf("%lx(%x): ", (unsigned long)&dma_list_buffer[i], (unsigned int)i);
		for (j=0; j<4; j++) {
			printf("%lx ", (unsigned long)spu_extract(dma_list_buffer[i], j));
		}
		printf("\n");
	}
#endif
}
Exemple #12
0
/*
 * Returns, per vector lane, the sum of an advection term and a diffusion
 * term for a cell `c` and its neighbours (suffix 1l/2l: first/second value
 * to the left, 1r/2r: first/second value to the right).
 *
 *   advection = (FL - FR) / cell_size
 *     FL = windL * SIXTH * (windL > 0 ? 5*c1l + 2*c - c2l
 *                         : windL < 0 ? 2*c1l + 5*c - c1r : 0)
 *     FR = windR * SIXTH * (windR > 0 ? 5*c + 2*c1r - c1l
 *                         : windR < 0 ? 2*c + 5*c1r - c2r : 0)
 *     windL = (w1l + w) * HALF,  windR = (w1r + w) * HALF
 *   diffusion = ((d1l + d) * HALF * (c1l - c) - (d + d1r) * HALF * (c - c1r))
 *               / (cell_size * cell_size)
 *
 * w2l, d2l, w2r and d2r are accepted but never read.
 * HALF, TWO, FIVE, SIXTH, ZERO and VEC_DIVIDE are defined outside this block
 * (presumably splatted constants and a vector division helper -- confirm).
 */
inline vector real_t
advec_diff_v(vector real_t cell_size,
             vector real_t c2l, vector real_t w2l, vector real_t d2l, 
             vector real_t c1l, vector real_t w1l, vector real_t d1l, 
             vector real_t   c, vector real_t   w, vector real_t   d, 
             vector real_t c1r, vector real_t w1r, vector real_t d1r, 
             vector real_t c2r, vector real_t w2r, vector real_t d2r)
{    
    vector real_t acc1, acc2, acc3;
    vector real_t wind, diff_term, advec_term;
    vector real_t advec_term_pos, advec_term_neg;
    vector real_t advec_termR, advec_termL;
    
    /* ---- left side: wind averaged between the left neighbour and the cell */
    acc1 = spu_add(w1l, w);
    wind = spu_mul(acc1, HALF);
    /* candidate for wind > 0: 5*c1l + 2*c - c2l */
    acc1 = spu_mul(c1l, FIVE);
    acc2 = spu_mul(c, TWO);
    advec_term_pos = spu_add(acc1, acc2);
    advec_term_pos = spu_sub(advec_term_pos, c2l);
    /* candidate for wind < 0: 2*c1l + 5*c - c1r */
    acc1 = spu_mul(c1l, TWO);
    acc2 = spu_mul(c, FIVE);
    advec_term_neg = spu_add(acc1, acc2);
    advec_term_neg = spu_sub(advec_term_neg, c1r);
    /* the compare results are used as bit masks: each candidate survives
       only in the lanes whose wind has the matching sign */
    acc1 = (vector real_t)spu_cmpgt(wind, ZERO);
    acc1 = spu_and(acc1, advec_term_pos);
    acc2 = (vector real_t)spu_cmpgt(ZERO, wind);
    acc2 = spu_and(acc2, advec_term_neg);
    advec_termL = spu_add(acc1, acc2);
    advec_termL = spu_mul(advec_termL, SIXTH);
    advec_termL = spu_mul(advec_termL, wind);
    /* ---- right side: same scheme with the stencil shifted by one cell */
    acc1 = spu_add(w1r, w);
    wind = spu_mul(acc1, HALF);
    /* candidate for wind > 0: 5*c + 2*c1r - c1l */
    acc1 = spu_mul(c, FIVE);
    acc2 = spu_mul(c1r, TWO);
    advec_term_pos = spu_add(acc1, acc2);
    advec_term_pos = spu_sub(advec_term_pos, c1l);
    /* candidate for wind < 0: 2*c + 5*c1r - c2r */
    acc1 = spu_mul(c, TWO);
    acc2 = spu_mul(c1r, FIVE);
    advec_term_neg = spu_add(acc1, acc2);
    advec_term_neg = spu_sub(advec_term_neg, c2r);
    acc1 = (vector real_t)spu_cmpgt(wind, ZERO);
    acc1 = spu_and(acc1, advec_term_pos);
    acc2 = (vector real_t)spu_cmpgt(ZERO, wind);
    acc2 = spu_and(acc2, advec_term_neg);
    advec_termR = spu_add(acc1, acc2);
    advec_termR = spu_mul(advec_termR, SIXTH);
    advec_termR = spu_mul(advec_termR, wind);
    /* ---- advection: difference of the two side terms over the cell size */
    acc1 = spu_sub(advec_termL, advec_termR);
    advec_term = VEC_DIVIDE(acc1, cell_size);
    /* ---- diffusion: averaged d times the concentration difference on each
       side, left minus right, over cell_size squared */
    acc1 = spu_add(d1l, d);
    acc1 = spu_mul(acc1, HALF);
    acc3 = spu_sub(c1l, c);
    acc1 = spu_mul(acc1, acc3);
    acc2 = spu_add(d, d1r);
    acc2 = spu_mul(acc2, HALF);
    acc3 = spu_sub(c, c1r);
    acc2 = spu_mul(acc2, acc3);
    acc1 = spu_sub(acc1, acc2);
    acc2 = spu_mul(cell_size, cell_size);
    diff_term = VEC_DIVIDE(acc1, acc2);
    return spu_add(advec_term, diff_term);
}
Exemple #13
0
/*
 * Ternary element-wise kernel operating in place on `inout`.
 *
 * `params` is read as a Ternary_params; p->cmd selects the operation and
 * p->length / 4 is the number of loop iterations.  `inout` holds the three
 * operands a, b and c one after the other and the result overwrites a:
 *
 *   AM   real                 a = (a + b) * c
 *   MA   real                 a = a * b + c
 *   CAM  complex, interleaved a = (a + b) * c   (re, im, re, im, ...)
 *   CMA  complex, interleaved a = a * b + c
 *   ZAM  complex, split       a = (a + b) * c   (all re, then all im)
 *   ZMA  complex, split       a = a * b + c
 *
 * Returns 0 after a recognised command and 1 for any other p->cmd.
 * pf, iter and n are not used.
 *
 * NOTE(review): the loops compare an unsigned counter with the int `length`
 * using `!=`; a negative p->length would not terminate as intended.
 */
int kernel(lwp_functions* pf,
	   void*             params,
	   void*             inout,
	   unsigned int      iter,
	   unsigned int      n)
{
  Ternary_params* p = (Ternary_params*)params;
  switch (p->cmd)
  {
    case AM:
    {
      // operands are `length` vectors each, stored back to back
      int length = p->length / 4;
      vector float *a = (vector float *)inout;
      vector float *b = a + length;
      vector float *c = a + 2 * length;
      unsigned int i;
      for (i = 0; i != length; ++i, ++a, ++b, ++c)
	*a = spu_mul(spu_add(*a, *b), *c);
      return 0;
    }
    case MA:
    {
      // same layout as AM, using the multiply-add intrinsic
      int length = p->length / 4;
      vector float *a = (vector float *)inout;
      vector float *b = a + length;
      vector float *c = a + 2 * length;
      unsigned int i;
      for (i = 0; i != length; ++i, ++a, ++b, ++c)
	*a = spu_madd(*a, *b, *c);
      return 0;
    }
    case CAM:
    {
      // shuffle patterns that re-interleave a vector of reals (first
      // operand) with a vector of imaginaries (second operand):
      // lo -> r0 i0 r1 i1, hi -> r2 i2 r3 i3
      static vector unsigned char lo = 
	(vector unsigned char) { 0, 1, 2, 3, 16, 17, 18, 19,
				 4, 5, 6, 7, 20, 21, 22, 23};

      static vector unsigned char hi = 
	(vector unsigned char) { 8,  9, 10, 11, 24, 25, 26, 27,
				12, 13, 14, 15, 28, 29, 30, 31};

      // each iteration handles 4 complex values = 8 floats per operand
      int length = p->length / 4;
      float *a = (float *)inout;
      float *b = a + 8 * length;
      float *c = a + 16 * length;
      unsigned int i;
      // (a + b) * c:
      // r.r = (a.r+b.r)*c.r - (a.i+b.i)*c.i
      // r.i = (a.r+b.r)*c.i + (a.i+b.i)*c.r
      for (i = 0; i != length; ++i, a+=8, b+=8, c+=8)
      {
	// de-interleave: even floats are real parts, odd floats imaginary
	vector float av = {*a, *(a+2), *(a+4), *(a+6)};              // a.r
	vector float bv = {*b, *(b+2), *(b+4), *(b+6)};              // b.r
	vector float cv = {*c, *(c+2), *(c+4), *(c+6)};              // c.r
	vector float dv = {*(a+1), *(a+3), *(a+5), *(a+7)};          // a.i
	vector float ev = {*(b+1), *(b+3), *(b+5), *(b+7)};          // b.i
	vector float fv = {*(c+1), *(c+3), *(c+5), *(c+7)};          // c.i
	vector float trv = spu_add(av, bv); // a.r+b.r
	vector float tiv = spu_add(dv, ev); // a.i+b.i
	vector float sv = spu_mul(trv, cv); // (a.r+b.r)*c.r
	vector float tv = spu_mul(trv, fv); // (a.r+b.r)*c.i
	vector float real = spu_nmsub(tiv, fv, sv); // r.r
	vector float imag = spu_madd(tiv, cv, tv);  // r.i
	// interleave result
	*(vector float *)a = spu_shuffle(real, imag, lo);
	*(vector float *)(a+4) = spu_shuffle(real, imag, hi);
      }
      return 0;
    }
    case CMA:
    {
      // same re-interleaving patterns as in the CAM case
      static vector unsigned char lo = 
	(vector unsigned char) { 0, 1, 2, 3, 16, 17, 18, 19,
				 4, 5, 6, 7, 20, 21, 22, 23};

      static vector unsigned char hi = 
	(vector unsigned char) { 8,  9, 10, 11, 24, 25, 26, 27,
				12, 13, 14, 15, 28, 29, 30, 31};

      int length = p->length / 4;
      float *a = (float *)inout;
      float *b = a + 8 * length;
      float *c = a + 16 * length;
      unsigned int i;
      // a * b + c:
      // r.r = a.r*b.r + c.r - a.i*b.i
      // r.i = a.r*b.i + c.i + a.i*b.r
      for (i = 0; i != length; ++i, a+=8, b+=8, c+=8)
      {
	vector float av = {*a, *(a+2), *(a+4), *(a+6)};              // a.r
	vector float bv = {*b, *(b+2), *(b+4), *(b+6)};              // b.r
	vector float cv = {*c, *(c+2), *(c+4), *(c+6)};              // c.r
	vector float dv = {*(a+1), *(a+3), *(a+5), *(a+7)};          // a.i
	vector float ev = {*(b+1), *(b+3), *(b+5), *(b+7)};          // b.i
	vector float fv = {*(c+1), *(c+3), *(c+5), *(c+7)};          // c.i
	vector float real = spu_nmsub(dv, ev, spu_madd(av, bv, cv)); // r.r
	vector float imag = spu_madd(dv, bv, spu_madd(av, ev, fv));  // r.i
	// interleave result
	*(vector float *)a = spu_shuffle(real, imag, lo);
	*(vector float *)(a+4) = spu_shuffle(real, imag, hi);
      }
      return 0;
    }
    case ZAM:
    {
      // split layout: six planes of 4 * length floats each, in the order
      // a.re, a.im, b.re, b.im, c.re, c.im
      int length = p->length / 4;
      float *a_re = (float *)inout;
      float *a_im = a_re + 4 * length;
      float *b_re = a_re + 8 * length;
      float *b_im = a_re + 12 * length;
      float *c_re = a_re + 16 * length;
      float *c_im = a_re + 20 * length;
      unsigned int i;
      // (a + b) * c:
      // r.r = (a.r+b.r)*c.r - (a.i+b.i)*c.i
      // r.i = (a.r+b.r)*c.i + (a.i+b.i)*c.r
      for (i = 0; i != length;
	   ++i, a_re+=4, b_re+=4, c_re+=4, a_im+=4, b_im+=4, c_im+=4)
      {
	vector float *av = (vector float *)a_re;
	vector float *bv = (vector float *)b_re;
	vector float *cv = (vector float *)c_re;
	vector float *dv = (vector float *)a_im;
	vector float *ev = (vector float *)b_im;
	vector float *fv = (vector float *)c_im;
	// both sums are taken before *av / *dv are overwritten below
	vector float trv = spu_add(*av, *bv); // a.r+b.r
	vector float tiv = spu_add(*dv, *ev); // a.i+b.i
	vector float sv = spu_mul(trv, *cv); // (a.r+b.r)*c.r
	vector float tv = spu_mul(trv, *fv); // (a.r+b.r)*c.i
	*av = spu_nmsub(tiv, *fv, sv); // r.r
        *dv = spu_madd(tiv, *cv, tv);  // r.i
      }
      return 0;
    }
    case ZMA:
    {
      // same split layout as in the ZAM case
      int length = p->length / 4;
      float *a_re = (float *)inout;
      float *a_im = a_re + 4 * length;
      float *b_re = a_re + 8 * length;
      float *b_im = a_re + 12 * length;
      float *c_re = a_re + 16 * length;
      float *c_im = a_re + 20 * length;
      unsigned int i;
      // a * b + c:
      // r.r = a.r*b.r + c.r - a.i*b.i
      // r.i = a.r*b.i + c.i + a.i*b.r
      for (i = 0; i != length;
	   ++i, a_re+=4, b_re+=4, c_re+=4, a_im+=4, b_im+=4, c_im+=4)
      {
	vector float *av = (vector float *)a_re;
	vector float *bv = (vector float *)b_re;
	vector float *cv = (vector float *)c_re;
	vector float *dv = (vector float *)a_im;
	vector float *ev = (vector float *)b_im;
	vector float *fv = (vector float *)c_im;
	// the real part goes through tmp because the imaginary part still
	// needs the original *av
	vector float tmp = spu_nmsub(*dv, *ev, spu_madd(*av, *bv, *cv));
	*dv = spu_madd(*dv, *bv, spu_madd(*av, *ev, *fv));
	*av = tmp;
      }
      return 0;
    }
  }
  // unrecognised command
  return 1;
}
Exemple #14
0
/*
 * Double-precision vector division: returns a_in / b_in for both lanes.
 *
 * Outline:
 *   1. classify each operand (NaN, zero, infinity, denormal),
 *   2. scale denormal inputs so they become normalized numbers,
 *   3. force both significands into [1.0,2.0) and divide them using a
 *      single-precision reciprocal estimate refined by Newton-Raphson,
 *   4. rebuild the exponent as a separate multiplier, clamped on overflow
 *      and zeroed on underflow,
 *   5. patch the multiplier for the special operands and apply it.
 */
vector double
__divv2df3 (vector double a_in, vector double b_in)
{
    /* Variables */
    vec_int4    exp, exp_bias;
    vec_uint4   no_underflow, overflow;
    vec_float4  mant_bf, inv_bf;
    vec_ullong2 exp_a, exp_b;
    vec_ullong2 a_nan, a_zero, a_inf, a_denorm, a_denorm0;
    vec_ullong2 b_nan, b_zero, b_inf, b_denorm, b_denorm0;
    vec_ullong2 nan;
    vec_uint4   a_exp, b_exp;
    vec_ullong2 a_mant_0, b_mant_0;
    vec_ullong2 a_exp_1s, b_exp_1s;
    vec_ullong2 sign_exp_mask;

    vec_double2 a, b;
    vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult;

    /* Constants */
    vec_uint4   exp_mask_u32 = spu_splats((unsigned int)0x7FF00000);
    /* copies the high word of each double into both words of that double */
    vec_uchar16 splat_hi = (vec_uchar16) {
        0,1,2,3, 0,1,2,3,  8, 9,10,11, 8,9,10,11
    };
    /* swaps the two words of each double */
    vec_uchar16 swap_32 = (vec_uchar16) {
        4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
    };
    vec_ullong2 exp_mask = spu_splats(0x7FF0000000000000ULL);
    vec_ullong2 sign_mask = spu_splats(0x8000000000000000ULL);
    vec_float4  onef = spu_splats(1.0f);
    vec_double2 one = spu_splats(1.0);
    /* a double whose exponent field is 0x035 and whose mantissa is zero */
    vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);

    sign_exp_mask = spu_or(sign_mask, exp_mask);

    /* Extract the floating point components from each of the operands including
     * exponent and mantissa.
     */
    a_exp = (vec_uint4)spu_and((vec_uint4)a_in, exp_mask_u32);
    a_exp = spu_shuffle(a_exp, a_exp, splat_hi);
    b_exp = (vec_uint4)spu_and((vec_uint4)b_in, exp_mask_u32);
    b_exp = spu_shuffle(b_exp, b_exp, splat_hi);

    /* The mantissa test is done per 32-bit word; ANDing with the
     * word-swapped result makes it true only if both words are zero.
     */
    a_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_mask), 0);
    a_mant_0 = spu_and(a_mant_0, spu_shuffle(a_mant_0, a_mant_0, swap_32));

    b_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_mask), 0);
    b_mant_0 = spu_and(b_mant_0, spu_shuffle(b_mant_0, b_mant_0, swap_32));

    a_exp_1s = (vec_ullong2)spu_cmpeq(a_exp, exp_mask_u32);
    b_exp_1s = (vec_ullong2)spu_cmpeq(b_exp, exp_mask_u32);

    /* Identify all possible special values that must be accommodated including:
     * +-denorm, +-0, +-infinity, and NaNs.
     */
    a_denorm0= (vec_ullong2)spu_cmpeq(a_exp, 0);
    a_nan    = spu_andc(a_exp_1s, a_mant_0);
    a_zero   = spu_and (a_denorm0, a_mant_0);
    a_inf    = spu_and (a_exp_1s, a_mant_0);
    a_denorm = spu_andc(a_denorm0, a_zero);

    b_denorm0= (vec_ullong2)spu_cmpeq(b_exp, 0);
    b_nan    = spu_andc(b_exp_1s, b_mant_0);
    b_zero   = spu_and (b_denorm0, b_mant_0);
    b_inf    = spu_and (b_exp_1s, b_mant_0);
    b_denorm = spu_andc(b_denorm0, b_zero);

    /* Scale denorm inputs into normalized numbers by conditionally scaling the
     * input parameters.
     */
    a = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_mask));
    a = spu_sel(a_in, a, a_denorm);

    b = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_mask));
    b = spu_sel(b_in, b, b_denorm);

    /* Extract the divisor and dividend exponent and force parameters into the signed
     * range [1.0,2.0) or (-2.0,-1.0].
     */
    exp_a = spu_and((vec_ullong2)a, exp_mask);
    exp_b = spu_and((vec_ullong2)b, exp_mask);

    mant_a = spu_sel(a, one, (vec_ullong2)exp_mask);
    mant_b = spu_sel(b, one, (vec_ullong2)exp_mask);

    /* Approximate the single reciprocal of b by using
     * the single precision reciprocal estimate followed by one
     * single precision iteration of Newton-Raphson.
     */
    mant_bf = spu_roundtf(mant_b);
    inv_bf = spu_re(mant_bf);
    inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf);

    /* Perform 2 more Newton-Raphson iterations in double precision. The
     * result (q1) is in the range (0.5, 2.0).
     */
    inv_b = spu_extend(inv_bf);
    inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b);
    q0 = spu_mul(mant_a, inv_b);
    q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0);

    /* Determine the exponent correction factor that must be applied
     * to q1 by taking into account the exponent of the normalized inputs
     * and the scale factors that were applied to normalize them.
     */
    exp = spu_rlmaska(spu_sub((vec_int4)exp_a, (vec_int4)exp_b), -20);
    exp = spu_add(exp, (vec_int4)spu_add(spu_and((vec_int4)a_denorm, -0x34), spu_and((vec_int4)b_denorm, 0x34)));

    /* Bias the quotient exponent depending on the sign of the exponent correction
     * factor so that a single multiplier will ensure the entire double precision
     * domain (including denorms) can be achieved.
     *
     *    exp          bias q1     adjust exp
     *   =====         ========    ==========
     *   positive         2^+65         -65
     *   negative         2^-64         +64
     */
    exp_bias = spu_xor(spu_rlmaska(exp, -31), 64);
    exp = spu_sub(exp, exp_bias);

    q1 = spu_sel(q1, (vec_double2)spu_add((vec_int4)q1, spu_sl(exp_bias, 20)), exp_mask);

    /* Compute a multiplier (mult) to be applied to the quotient (q1) to produce the
     * expected result. On overflow, clamp the multiplier to the maximum non-infinite
     * number in case the rounding mode is not round-to-nearest.
     */
    exp = spu_add(exp, 0x3FF);
    no_underflow = spu_cmpgt(exp, 0);
    overflow = spu_cmpgt(exp, 0x7FE);
    exp = spu_and(spu_sl(exp, 20), (vec_int4)no_underflow);
    exp = spu_and(exp, (vec_int4)exp_mask);

    mult = spu_sel((vec_double2)exp, (vec_double2)(spu_add((vec_uint4)exp_mask, -1)), (vec_ullong2)overflow);

    /* Handle special value conditions. These include:
     *
     * 1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN a NaN
     *    results.
     * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN a INFINITY results.
     * 3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0 results.
     */
    mult = spu_andc(mult, (vec_double2)spu_or(a_zero, b_inf));
    mult = spu_sel(mult, (vec_double2)exp_mask, spu_or(a_inf, b_zero));

    nan = spu_or(a_nan, b_nan);
    nan = spu_or(nan, spu_and(a_zero, b_zero));
    nan = spu_or(nan, spu_and(a_inf, b_inf));

    /* ORing the all-ones NaN mask into mult turns it into a NaN pattern. */
    mult = spu_or(mult, (vec_double2)nan);

    /* Scale the final quotient */

    q2 = spu_mul(q1, mult);

    return (q2);
}
/*
 * Pointwise complex multiply of two arrays of interleaved (re, im) 32-bit
 * floats whose addresses need not be 16-byte aligned.
 *
 *   target    - destination buffer for the products
 *   src0      - first input buffer
 *   src1      - second input buffer
 *   num_bytes - number of bytes to process
 *
 * Returns target.
 *
 * The routine works a quadword (two complex values) at a time: inputs are
 * re-aligned with shuffle masks built from the low four address bits, the
 * complex product is formed, and the result is merged back into the two
 * target quadwords it straddles.  After the num_bytes/16 full iterations a
 * tail step handles the remaining num_bytes%16 bytes.
 *
 * NOTE(review): the tail step runs unconditionally and touches the quadwords
 * at offsets 0 and 16 from the current src/target counters even when
 * num_bytes is a multiple of 16 - confirm the callers' buffers allow this.
 */
void* libvector_pointwise_multiply_32fc_unaligned(void* target,  void* src0, void* src1, unsigned int num_bytes){
	//loop iterator i (unsigned to match num_bytes)
	unsigned int i = 0;
	void* retval = target;


	//put the target and source addresses into qwords
	vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
	vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
	vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};

	//create shuffle masks

	//shuffle mask building blocks:
	//all from the first vector
	vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
								  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
	//all from the second vector
	vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
										 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};

	//Each mask below is oneup with 16 added to every byte index >= a break
	//point, so those bytes select from the second shuffle operand.  The
	//word-wide spu_add is safe because no byte exceeds 15 + 16 = 31, hence
	//no carry crosses a byte boundary.
	vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);

	//gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
	vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
	vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
	vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
	vector unsigned char cmp_res = spu_or(gt_res, eq_res);
	vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
	vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
												 (vector unsigned int)oneup);
	shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);

	//eta: second half of the second, first half of the first, break at (unsigned int)src1%16
	src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
	gt_res = spu_cmpgt(oneup, src_cmp);
	eq_res = spu_cmpeq(oneup, src_cmp);
	cmp_res = spu_or(gt_res, eq_res);
	phase_change = spu_and(sixteen_uchar, cmp_res);
	vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
												 (vector unsigned int)oneup);
	shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);

	//second-vector indices rotated by the target misalignment
	vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));

	//alpha: first half of first, second half of second, break at (unsigned int)target%16
	src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
	gt_res = spu_cmpgt(oneup, src_cmp);
	eq_res = spu_cmpeq(oneup, src_cmp);
	cmp_res = spu_or(gt_res, eq_res);
	phase_change = spu_and(sixteen_uchar, cmp_res);
	vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
													 (vector unsigned int)oneup);

	//delta: first half of first, first half of second, break at (unsigned int)target%16
	vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
	//epsilon: second half of second, second half of first, break at (unsigned int)target%16
	vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
	//zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
	vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);

	//beta: first half of first, second half of second, break at num_bytes%16
	src_cmp = spu_splats((unsigned char)(num_bytes%16));
	gt_res = spu_cmpgt(oneup, src_cmp);
	eq_res = spu_cmpeq(oneup, src_cmp);
	cmp_res = spu_or(gt_res, eq_res);
	phase_change = spu_and(sixteen_uchar, cmp_res);
	vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
													 (vector unsigned int)oneup);

	//"past" is the quadword at the current counter, "present" the one 16 bytes on
	qword src0_past;
	qword src0_present;
	qword src1_past;
	qword src1_present;
	qword tgt_past;
	qword tgt_present;

	qword in_temp0;
	qword in_temp1;
	qword out_temp0;
	qword out_temp1;


	src0_past = si_lqd((qword)address_counter_src0, 0);
	src1_past = si_lqd((qword)address_counter_src1, 0);
	tgt_past = si_lqd((qword)address_counter_tgt, 0);

	//swaps re and im within each complex value
	vector unsigned char shuffle_mask_complexprod0 = {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
													  0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b};
	//picks words 0 and 2 of both operands, interleaved
	vector unsigned char shuffle_mask_complexprod1 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
													  0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b};
	//picks words 1 and 3 of both operands, interleaved
	vector unsigned char shuffle_mask_complexprod2 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
													  0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
	//flips the sign bit of words 1 and 3 (the im*im terms)
	vector unsigned char sign_changer = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
										 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};

	vector float prod0;
	qword shuf0;
	vector float prod1;
	vector float sign_change;
	qword summand0;
	qword summand1;
	vector float sum;


	for(i = 0; i < num_bytes/16; ++i) {

		src0_present = si_lqd((qword)address_counter_src0, 16);
		src1_present = si_lqd((qword)address_counter_src1, 16);
		tgt_present = si_lqd((qword)address_counter_tgt, 16);

		//re-align the two inputs
		in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
		in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);

		//prod0 = {ar*br, ai*bi, ...}, prod1 = {ar*bi, ai*br, ...}
		prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
		shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
		prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
		sign_change = spu_xor(prod0, (vector float)sign_changer);

		//summand0 = {ar*br, ar*bi, ...}, summand1 = {-ai*bi, ai*br, ...}
		summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);

		summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);

		sum = spu_add((vector float)summand0, (vector float)summand1);


		//merge the result into the two target quadwords it straddles
		out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
		out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);

		si_stqd(out_temp0, (qword)address_counter_tgt, 0);
		si_stqd(out_temp1, (qword)address_counter_tgt, 16);

		tgt_past = out_temp1;
		src0_past = src0_present;
		src1_past = src1_present;
		address_counter_src0 = spu_add(address_counter_src0, 16);
		address_counter_src1 = spu_add(address_counter_src1, 16);
		address_counter_tgt = spu_add(address_counter_tgt, 16);


	}

	//tail: same computation once more for the remaining num_bytes%16 bytes
	src0_present = si_lqd((qword)address_counter_src0, 16);
	src1_present = si_lqd((qword)address_counter_src1, 16);
	tgt_present = si_lqd((qword)address_counter_tgt, 16);


	in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
	in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);


	prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
	shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
	//cross terms must use the input (in_temp0), exactly as in the loop above;
	//multiplying prod0 here would corrupt the imaginary parts of the tail
	prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
	sign_change = spu_xor(prod0, (vector float)sign_changer);
	summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
	summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
	sum = spu_add((vector float)summand0, (vector float)summand1);



	//keep the first num_bytes%16 bytes of sum and take the remaining bytes
	//from the re-aligned existing target contents
	qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
	qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);



	out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
	out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);

	si_stqd(out_temp0, (qword)address_counter_tgt, 0);
	si_stqd(out_temp1, (qword)address_counter_tgt, 16);

	return retval;
}
Exemple #16
0
/*
 * Merge step for the active merger `am`.
 *
 * While the LEFT and RIGHT input buffers both hold data and the OUT buffer
 * has free space, one vector from each input is combined: the four smallest
 * values (after sort_vectors) are written to OUT and the four largest stay in
 * the input buffer that was not consumed.  Tail/head indices are advanced
 * with wrap-around at the respective buffer size.  Afterwards the consumed
 * counters, the depleted flags and - when the output is local and both inputs
 * are depleted - the done flag and num_active_mergers are updated.
 */
void merge_buffers(){
  vector unsigned int cmp_v;

  const vector signed int one_at_0 = {1,0,0,0};
  const vector signed int one_at_1 = {0,1,0,0};
  const vector signed int one_at_2 = {0,0,1,0};
  const vector signed int ones = {1,1,1,1};
  const vector signed int zeros = {0,0,0,0};

  // replicates the last byte of cmp_v (i.e. of its element 3) to every byte
  const vector unsigned char cmp_v_shuffle_mask = {31,31,31,31,
						   31,31,31,31,
						   31,31,31,31,
						   31,31,31,31};
  vector unsigned char rev_mask;
  // word-reversal of the first shuffle operand
  const vector unsigned char rev_left = {12,13,14,15,
					 8,9,10,11,
					 4,5,6,7,
					 0,1,2,3};

  // word-reversal of the second shuffle operand
  const vector unsigned char rev_right = {28,29,30,31,
					  24,25,26,27,
					  20,21,22,23,
					  16,17,18,19};
  vector signed int *out_head_idx;
  if(mcb[am].local[OUT] < 255){
    // output feeds a local parent: advance the parent's head index directly
    int parent_idx = mcb[am].local[OUT];
    int side = (mcb[am].id+1)&1;
    out_head_idx = (vector signed int*) &md[parent_idx].idx[side][HEAD];
  } else {
    out_head_idx = (vector signed int*) &md[am].idx[OUT][HEAD];
  }

  vector signed int *left_tail_idx = (vector signed int*) &md[am].idx[LEFT][TAIL];
  vector signed int *right_tail_idx = (vector signed int*) &md[am].idx[RIGHT][TAIL];

  // Per-buffer wrap points.  Each index is compared against the size of its
  // OWN buffer; comparing lane 0 of every index against a {LEFT,RIGHT,OUT,0}
  // vector would wrap RIGHT and OUT at the LEFT buffer size.
  const int size_left = mcb[am].buffer_size[LEFT];
  const int size_right = mcb[am].buffer_size[RIGHT];
  const int size_out = mcb[am].buffer_size[OUT];

  // lanes: 0 = vectors available LEFT, 1 = available RIGHT, 2 = free in OUT,
  // 3 = constant 1 so that the gathered mask is 0x0F when lanes 0..2 are > 0
  vector signed int avail_v = {num_in_buffer(LEFT), num_in_buffer(RIGHT), num_free_in_buffer(OUT), 1};
  vector signed int avail_before = { spu_extract(avail_v, 0), spu_extract(avail_v, 1), 0, 0 };
  vector unsigned int avail = spu_gather( spu_cmpgt(avail_v, zeros) ); // avail = 0x0F if all avail_v > zeros

  vector signed int *left, *right, *out;
  left = (vector signed int*) &md[am].buffer[LEFT][ spu_extract(*left_tail_idx,0) ];
  right = (vector signed int*) &md[am].buffer[RIGHT][ spu_extract(*right_tail_idx,0) ];
  out = (vector signed int*) &md[am].buffer[OUT][ spu_extract(*out_head_idx,0) ];

  #ifdef TRACE_TIME
    dec_val2 = spu_read_decrementer();
  #endif

  while(spu_extract(avail,0) == 0x0F){
    // cmp left and right to determine who gets eaten
    cmp_v = spu_cmpgt(*left,*right);
    cmp_v = spu_shuffle(cmp_v, cmp_v, cmp_v_shuffle_mask);
    // cmp_v = {FFFF,FFFF,FFFF,FFFF} if left[3] > right[3]

    *out = spu_sel(*left,*right,cmp_v);
    rev_mask = spu_sel(rev_right,rev_left,(vector unsigned char)cmp_v);
    *left = spu_shuffle(*left,*right,rev_mask);
    // data to be sorted is now in out and left, left in descending order

    sort_vectors(out,left);

    // update index of the used side
    if( spu_extract(cmp_v,0) ){
      // left[3] > right[3]: the right vector was consumed
      *right_tail_idx = spu_add(*right_tail_idx,ones);
      avail_v = spu_sub(avail_v, one_at_1);
      right++;
      // modulus hack: wrap at the RIGHT buffer size
      if( __builtin_expect( spu_extract(*right_tail_idx,0) == size_right ,0) ){
	*right_tail_idx = zeros;
	right = (vector signed int*) &md[am].buffer[RIGHT][0];
      }
    } else {
      // the left vector was consumed; the leftover (larger) values sit in
      // *left, so move them to the right slot before advancing left
      *right = *left;
      *left_tail_idx = spu_add(*left_tail_idx,ones);
      avail_v = spu_sub(avail_v, one_at_0);
      left++;
      // modulus hack: wrap at the LEFT buffer size
      if( __builtin_expect( spu_extract(*left_tail_idx,0) == size_left ,0) ){
	*left_tail_idx = zeros;
	left = (vector signed int*) &md[am].buffer[LEFT][0];
      }
    }

    // update out head idx
    *out_head_idx = spu_add(*out_head_idx,ones);
    avail_v = spu_sub(avail_v, one_at_2);
    out++;
    // modulus hack: wrap at the OUT buffer size
    if( __builtin_expect( spu_extract(*out_head_idx,0) == size_out ,0) ){
      out = (vector signed int*) &md[am].buffer[OUT][0];
      *out_head_idx = zeros;
    }

    // is there data still available?
    avail = spu_gather(spu_cmpgt(avail_v, zeros));
  }

  #ifdef TRACE_TIME
  merge_loop_ticks += -(spu_read_decrementer() - dec_val2);
  #endif

  // how much got consumed from each input?
  vector signed int consumed = spu_sub(avail_before, avail_v);
  int consumed_left = spu_extract(consumed, 0);
  int consumed_right = spu_extract(consumed, 1);

  if(consumed_left)
    update_tail(LEFT);

  if(consumed_right)
    update_tail(RIGHT);

  md[am].consumed[LEFT] += consumed_left;
  md[am].consumed[RIGHT] += consumed_right;

  if(md[am].consumed[LEFT] == mcb[am].data_size[LEFT])
    md[am].depleted[LEFT] = 1;

  if(md[am].consumed[RIGHT] == mcb[am].data_size[RIGHT])
    md[am].depleted[RIGHT] = 1;

  if(mcb[am].local[OUT] < 255 && md[am].depleted[LEFT] && md[am].depleted[RIGHT]){
    md[am].done = 1;
    --num_active_mergers;
  }
}
Exemple #17
0
/* Scans the string pointed to by s for the character c and
 * returns a pointer to the last occurrence of c. If
 * c is not found, then NULL is returned.
 *
 * The string is examined one quadword (16 bytes) at a time. For each
 * quadword, spu_gather packs the per-byte compare results into a 16-bit
 * value in element 0, with the bit for byte 0 as the most significant of
 * those 16 bits. The most recent non-zero match mask (res_cmp) and the
 * pointer value at that time (res_ptr) are remembered until the quadword
 * holding the terminating 0 is reached.
 */
char * strrchr(const char *s, int c)
{
  int nskip;
  vec_uchar16 *ptr, data, vc;
  vec_uint4 cmp_c, cmp_0, cmp;
  vec_uint4 res_ptr, res_cmp;
  vec_uint4 mask, result;
  /* Low 16 bits set in every word: one bit per byte of a quadword. */
  vec_uint4 one = spu_splats(0xffffU);
  /* Scan memory array a quadword at a time. Skip leading
   * mis-aligned bytes.
   */
  ptr = (vec_uchar16 *)s;

  /* Shifting the all-ones mask right by the misalignment clears the bits
   * that correspond to bytes located before s in the first quadword.
   */
  nskip = -((unsigned int)(ptr) & 15);
  mask = spu_rlmask(one, nskip);

  vc = spu_splats((unsigned char)(c));

  /* Load the first quadword, then round ptr down to a 16-byte boundary so
   * it addresses the quadword that follows.
   * NOTE(review): this presumes a vector load through a misaligned pointer
   * fetches the enclosing aligned quadword - confirm for the target.
   */
  data = *ptr++;
  ptr = (vec_uchar16 *)((unsigned int)ptr & ~15);

  cmp_c = spu_and(spu_gather(spu_cmpeq(data, vc)), mask);
  cmp_0 = spu_and(spu_gather(spu_cmpeq(data, 0)), mask);

  res_ptr = spu_splats(0U);
  res_cmp = spu_splats(0U);

  /* Loop until a quadword containing a terminating 0 has been loaded. */
  while (spu_extract(cmp_0, 0) == 0) {
    cmp = spu_cmpeq(cmp_c, 0);

    /* If the quadword just examined contained c, remember its match mask
     * and the current ptr (which already points one quadword past it).
     */
    res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp);
    res_cmp = spu_sel(cmp_c, res_cmp, cmp);

    data = *ptr++;

    cmp_c = spu_gather(spu_cmpeq(data, vc));
    cmp_0 = spu_gather(spu_cmpeq(data, 0));

    /* This value is recomputed before its next use. */
    cmp = spu_cmpeq(cmp_c, 0);
  }

  /* Compute the location of the last character before termination
   * character.
   *
   * First mask off compare results following the first termination character.
   * The bit at the terminator's own position is kept, so searching for
   * c == 0 can match the terminator.
   */
  mask = spu_sl(one, 31 - spu_extract(spu_cntlz(cmp_0), 0));
  cmp_c = spu_and(cmp_c, mask);

  /* Conditionally update res_ptr and res_cmp if a match was found in the last
   * quadword.
   */
  cmp = spu_cmpeq(cmp_c, 0);

  res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp);
  res_cmp = spu_sel(cmp_c, res_cmp, cmp);

  /* Bit reverse res_cmp for locating last occurrence: expand the 16 bits to
   * 16 bytes, reverse the byte order, and gather them back into 16 bits so
   * that a leading-zero count finds the LAST matching byte.
   */
  mask = spu_cmpeq(res_cmp, 0);

  res_cmp = (vec_uint4)spu_maskb(spu_extract(res_cmp, 0));
  res_cmp = spu_gather((vec_uchar16)spu_shuffle(res_cmp, res_cmp,
						VEC_LITERAL(vec_uchar16,
							    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)));

  /* Compute the location (ptr) of the last occurrence of c. If no
   * occurrence was found (ie, element 0 of res_cmp == 0, then return
   * NULL.
   *
   * res_ptr is 16 bytes past the start of the matching quadword and the
   * leading-zero count of the reversed mask is 31 - (index of last match),
   * so res_ptr + 15 - cntlz yields the address of that byte.
   */
  result = spu_sub(spu_add(res_ptr, 15), spu_cntlz(res_cmp));
  result = spu_andc(result, mask);

  return ((char *)spu_extract(result, 0));
}
Exemple #18
0
/*
 * Returns the buffer into which the next triangle for `context` is written,
 * or NULL when the context has no renderable cache line or when extending
 * the write pointer could clobber a chunk that is still in use.
 *
 * On success the _currentTriangle* globals describe the new write position.
 */
Triangle* getTriangleBuffer(Context* context)
{
	// Reuse the buffer handed out earlier for this very context, if any.
	const int reusable = (context == _currentTriangleContext) && (_currentTriangle != NULL);
	if (reusable) {
		return _currentTriangle;
	}

	// Otherwise start from scratch for this context.
	_currentTriangleContext	= context;
	_currentTriangle	= NULL;

	unsigned long long line_ea = context->renderableCacheLine;
	if (!line_ea) {
		return NULL;
	}

	// Pull the 128-byte renderable cache line into a 128-byte aligned slot on
	// the stack (GETLLAR command, then read the atomic status channel).
	char raw_line[128+127];
	RenderableCacheLine* line = (RenderableCacheLine*) ( ((unsigned int)raw_line+127) & ~127 );

	spu_mfcdma64(line, mfc_ea2h(line_ea), mfc_ea2l(line_ea), 128, 0, MFC_GETLLAR_CMD);
	spu_readch(MFC_RdAtomicStat);

	// The write pointer is tested against the read pointer of all 16 chunks
	// at once (8 per vector).  Per chunk:
	//
	//   extend_ok  = (read > write ? read : end) > test
	//   rewind     = next > end
	//   rewind_bad = (read == 0)
	//   blocked    = !extend_ok || (rewind && rewind_bad)
	//
	// A blocked chunk is ignored when it is marked as a free block.

	vec_ushort8 write_pos	= spu_splats( line->endTriangle );
	vec_ushort8 buffer_end	= spu_splats( (unsigned short)TRIANGLE_BUFFER_SIZE);
	vec_ushort8 test_pos	= spu_add(write_pos,   TRIANGLE_MAX_SIZE);
	vec_ushort8 next_pos	= spu_add(write_pos, 2*TRIANGLE_MAX_SIZE);
	vec_ushort8 all_zero	= spu_splats( (unsigned short) 0 );

	// chunks 0..7
	vec_ushort8 read_lo	= line->chunkTriangle[0];
	vec_ushort8 limit_lo	= spu_sel( buffer_end, read_lo, spu_cmpgt( read_lo, write_pos ) );
	vec_ushort8 fits_lo	= spu_cmpgt( limit_lo, test_pos );
	vec_ushort8 at_zero_lo	= spu_cmpeq( read_lo, all_zero );

	// chunks 8..15
	vec_ushort8 read_hi	= line->chunkTriangle[1];
	vec_ushort8 limit_hi	= spu_sel( buffer_end, read_hi, spu_cmpgt( read_hi, write_pos ) );
	vec_ushort8 fits_hi	= spu_cmpgt( limit_hi, test_pos );
	vec_ushort8 at_zero_hi	= spu_cmpeq( read_hi, all_zero );

	vec_ushort8 wraps	= spu_cmpgt( next_pos, buffer_end );

	// Narrow the two 8 x 16-bit masks to one 16 x 8-bit mask by keeping the
	// second byte of every halfword.
	vec_uchar16 odd_bytes	= (vec_uchar16) { 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 };
	vec_uchar16 fits	= (vec_uchar16) spu_shuffle( fits_lo, fits_hi, odd_bytes );
	vec_uchar16 at_zero	= (vec_uchar16) spu_shuffle( at_zero_lo, at_zero_hi, odd_bytes );
	vec_uchar16 wraps_bytes	= (vec_uchar16) wraps;

	// blocked = (rewind && rewind_bad) || !extend_ok
	vec_uchar16 blocked	= spu_orc( spu_and( at_zero, wraps_bytes ), fits );

	// one bit per chunk; bits of chunks marked free are masked out
	vec_uint4 free_bits	= spu_gather(
		spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), line->chunkNext ) );
	vec_uint4 blocked_bits	= spu_andc( spu_gather( blocked ), (vec_uint4) free_bits );

	// any remaining bit means the triangle cannot be placed right now
	if ( spu_extract(blocked_bits, 0) ) {
		return NULL;
	}

	// When the write offset is not 128-byte aligned, first fetch the 128-byte
	// block it falls into so the bytes in front of the triangle are present.
	unsigned int write_offset = line->endTriangle;
	unsigned long long buffer_ea = line_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (write_offset & ~127);
	_currentTriangleBufferExtra = write_offset & 127;
	if (_currentTriangleBufferExtra) {
		spu_mfcdma64(_currentTriangleBuffer, mfc_ea2h(buffer_ea), mfc_ea2l(buffer_ea), 128, 0, MFC_GET_CMD);

		// wait for the tag 0 transfer to complete
		mfc_write_tag_mask(1<<0);
		mfc_read_tag_status_all();
	}

	// publish the state describing the new write position
	_currentTriangleOffset = write_offset;
	_currentTriangleRewind = wraps;
	_currentTriangleBufferEA = buffer_ea;
	_currentTriangleCacheEndTriangleEAH = mfc_ea2h(line_ea);
	_currentTriangleCacheEndTriangleEAL = mfc_ea2l(line_ea) + (((char*)&line->endTriangle) - ((char*)line));
	_currentTriangle = (Triangle*) (_currentTriangleBuffer+_currentTriangleBufferExtra);

	// hand back the buffer, ready for use
	return _currentTriangle;
}