Esempio n. 1
0
/**
 * Resets the min/max bin arrays kept in slot `index` and hands them to `mmb`.
 *
 * When `nbins` differs from the count currently recorded for the slot, the
 * slot's arrays are released (only if the recorded count is positive) and
 * re-allocated with `nbins` entries. Every entry is then cleared, and `mmb`
 * receives the array pointers, the bin count and a starting bestcost of
 * 1000000.
 *
 * @param mmb    descriptor that receives the slot's arrays and bin count
 * @param nbins  number of bins wanted
 * @param index  slot in the minbins / maxbins / numbins tables
 */
void KDBuildJob::ResetMinMaxBin(minmaxbin_t *mmb, int nbins, int index)
{
	const bool needs_resize = (numbins[index] != nbins);

	if(needs_resize)
	{
		// Only a slot with a positive recorded count has storage to release.
		const bool owns_storage = (numbins[index] > 0);

		#ifdef _CELL
			if(owns_storage)
			{
				_free_align(minbins[index]);
				_free_align(maxbins[index]);
			}

			numbins[index] = nbins;
			minbins[index] = (bin_t*)_malloc_align(numbins[index] * sizeof(bin_t), 7);
			maxbins[index] = (bin_t*)_malloc_align(numbins[index] * sizeof(bin_t), 7);
		#else
			if(owns_storage)
			{
				delete [] minbins[index];
				delete [] maxbins[index];
			}

			numbins[index] = nbins;
			minbins[index] = new bin_t[numbins[index]];
			maxbins[index] = new bin_t[numbins[index]];
		#endif
	}

	const int count = numbins[index];

	#ifdef _CELL
	// One vector store per bin index clears each array.
	const vector float cleared = spu_splats(0.0f);
	vector float *min_vec = (vector float*)minbins[index];
	vector float *max_vec = (vector float*)maxbins[index];

	for(int bin = 0; bin < count; ++bin)
	{
		min_vec[bin] = cleared;
		max_vec[bin] = cleared;
	}
	#else
	// Clear the three counters of every bin in both arrays.
	for(int bin = 0; bin < count; ++bin)
	{
		for(int axis = 0; axis < 3; ++axis)
		{
			maxbins[index][bin].b[axis] = 0;
			minbins[index][bin].b[axis] = 0;
		}
	}
	#endif

	mmb->numbins = count;
	mmb->minbins = minbins[index];
	mmb->maxbins = maxbins[index];
	mmb->bestcost = 1000000;
}
Esempio n. 2
0
/**
 * Compares each doubleword of x, bit for bit, against 0x8000000000000000.
 *
 * The comparison is done on 32-bit words; the two word results belonging to
 * one doubleword are then replicated across that doubleword and ANDed, so a
 * doubleword of the result is all ones only when both of its words matched.
 */
vec_ullong2 cmpnegzerod2( vec_double2 x )
{
   /* Byte selectors: bytes 0-3 / 8-11 (first word of each doubleword) ... */
   const vec_uchar16 first_words = (vec_uchar16)(vec_uint4){ 0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b };
   /* ... and bytes 4-7 / 12-15 (second word of each doubleword). */
   const vec_uchar16 second_words = (vec_uchar16)(vec_uint4){ 0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f };

   const vec_int4 pattern = (vec_int4)spu_splats(0x8000000000000000ull);
   const vec_ullong2 word_eq = (vec_ullong2)spu_cmpeq( (vec_int4)x, pattern );

   return spu_and( spu_shuffle( word_eq, word_eq, first_words ),
                   spu_shuffle( word_eq, word_eq, second_words ) );
}
Esempio n. 3
0
/**
 * Maps four positions to bin numbers.
 *
 * Each position is clamped to [left, right], its distance from `left` is
 * scaled by `invdelta`, the result is limited to [0, nbins] and finally
 * converted to a signed integer vector.
 */
inline vector int GetBinSIMD(vector float left, vector float right, vector float pos, vector float invdelta, vector float nbins)
{
	// Upper bound is applied first, then the lower bound.
	const vector float clamped = spu_max(spu_min(pos, right), left);

	const vector float distance = spu_abs(spu_sub(clamped, left));
	const vector float scaled = spu_mul(distance, invdelta);

	// Keep the scaled value inside [0, nbins] before the conversion.
	const vector float limited = spu_max(spu_splats(0.0f), spu_min(scaled, nbins));

	return spu_convts(limited, 0);
}
Esempio n. 4
0
/*
 * Copies elements from the current merger's `side` buffer into its OUT
 * buffer, one element per loop iteration.
 *
 * The number of elements copied is the smaller of the free space reported
 * for OUT and the number of elements reported for `side`. Both the TAIL
 * index of `side` and the selected HEAD index are advanced with wrap-around
 * to zero when they reach their buffer size. Afterwards update_tail(side)
 * is called, the consumed counter for `side` is increased, and the merger is
 * marked depleted/done when all of `side`'s data has been consumed and
 * local[OUT] is below 255.
 */
void cp_buffer(int side){
  // Copy no more than OUT can take and no more than `side` currently holds.
  int avail_out = num_free_in_buffer(OUT);
  int avail_side = num_in_buffer(side);
  int max = avail_out < avail_side ? avail_out : avail_side;

  // Select the HEAD index vector to advance: the one stored in
  // md[local[OUT]] (slot (id+1)&1) when local[OUT] < 255, otherwise this
  // merger's own OUT head.
  vector signed int *out_head;
  if(mcb[am].local[OUT] < 255)
    out_head = (vector signed int*) &md[ mcb[am].local[OUT] ].idx[ (mcb[am].id+1)&1 ][HEAD];
  else
    out_head = (vector signed int*) &md[am].idx[OUT][HEAD];

  vector unsigned int cmp_v;
  // Wrap limits, splatted so they can be compared against the index vectors.
  vector signed int from_size = spu_splats( mcb[am].buffer_size[side] );
  // NOTE(review): out_size indexes mcb with local[OUT] even when the else
  // branch above (local[OUT] >= 255) was taken — confirm this is intended.
  vector signed int out_size = spu_splats( mcb[ mcb[am].local[OUT] ].buffer_size[ (mcb[am].id+1)&1 ] );
  vector signed int ones = {1,1,1,1};
  vector signed int zeros = {0,0,0,0};

  int i;
  for(i = 0; i < max; i++){
    // Element 0 of each index vector is used as the scalar array index.
    md[am].buffer[OUT][spu_extract( *out_head,0)] = md[am].buffer[side][spu_extract(md[am].idx[side][TAIL],0)];
    // Advance the source TAIL; reset it to zero when it equals from_size.
    md[am].idx[side][TAIL] = spu_add(md[am].idx[side][TAIL], ones);
    cmp_v = spu_cmpeq(md[am].idx[side][TAIL],from_size);
    md[am].idx[side][TAIL] = spu_sel(md[am].idx[side][TAIL], zeros, cmp_v);

    // Advance the selected HEAD the same way, wrapping at out_size.
    *out_head = spu_add(*out_head,ones);
    cmp_v = spu_cmpeq(*out_head, out_size);
    *out_head = spu_sel(*out_head,zeros,cmp_v);
  }

  update_tail(side);

  md[am].consumed[side] += max;

  // All data of `side` consumed (and local[OUT] < 255): retire this merger.
  if(mcb[am].local[OUT] < 255 && md[am].consumed[side] == mcb[am].data_size[side]){
    md[am].depleted[side] = 1;
    md[am].done = 1;
    --num_active_mergers;
  }
}
Esempio n. 5
0
int main()
{
    TEST_SET_START("20040930102649EJL","EJL", "negatef4");

    unsigned int i3n = 0xffffffff;
    unsigned int i3p = 0x7fffffff;

    float x0n = hide_float(-0.0f);
    float x0p = hide_float(0.0f);
    float x1n = hide_float(-83532.96153153f);
    float x1p = hide_float(83532.96153153f);
    float x2n = hide_float(-0.0000000013152f);
    float x2p = hide_float(0.0000000013152f);
    float x3n = hide_float(make_float(i3n));
    float x3p = hide_float(make_float(i3p));

    vec_float4 x0n_v = spu_splats(x0n);
    vec_float4 x0p_v = spu_splats(x0p);
    vec_float4 x1n_v = spu_splats(x1n);
    vec_float4 x1p_v = spu_splats(x1p);
    vec_float4 x2n_v = spu_splats(x2n);
    vec_float4 x2p_v = spu_splats(x2p);
    vec_float4 x3n_v = spu_splats(x3n);
    vec_float4 x3p_v = spu_splats(x3p);

    vec_float4 res_v;

    TEST_START("negatef4");
    res_v = negatef4(x0n_v);
    TEST_CHECK("20040930102652EJL", allequal_float4( res_v, x0p_v ), 0);
    res_v = negatef4(x0p_v);
    TEST_CHECK("20040930102653EJL", allequal_float4( res_v, x0n_v ), 0);
    res_v = negatef4(x1n_v);
    TEST_CHECK("20040930102655EJL", allequal_float4( res_v, x1p_v ), 0);
    res_v = negatef4(x1p_v);
    TEST_CHECK("20040930102657EJL", allequal_float4( res_v, x1n_v ), 0);
    res_v = negatef4(x2n_v);
    TEST_CHECK("20040930102659EJL", allequal_float4( res_v, x2p_v ), 0);
    res_v = negatef4(x2p_v);
    TEST_CHECK("20040930102701EJL", allequal_float4( res_v, x2n_v ), 0);
    res_v = negatef4(x3n_v);
    TEST_CHECK("20040930102703EJL", allequal_float4( res_v, x3p_v ), 0);
    res_v = negatef4(x3p_v);
    TEST_CHECK("20040930102705EJL", allequal_float4( res_v, x3n_v ), 0);

    TEST_SET_DONE();

    TEST_EXIT();
}
Esempio n. 6
0
/*
 * Computes a[i] = scalar * c[i] + b[i] over args.N floats.
 *
 * Data is processed in chunks of SIZE floats: b and c are fetched into the
 * local buffers ls1/ls2 with mfc_get, combined with spu_madd into ls3, and
 * ls3 is written back to a with mfc_put. All transfers use the single tag
 * TAG, and the code waits on that tag before touching the buffers.
 */
void triad()
{
	int i, j, n;
	
	vector float s = spu_splats(args.scalar);
	
	/* Byte size of one full chunk. */
	n = SIZE * sizeof(float);

	/* Full chunks; the loop stops while at most SIZE elements remain. */
	for (i = 0; (i + SIZE) < args.N; i += SIZE) {
		mfc_get((void *)&ls1[0], (unsigned int )&args.b[i], n, TAG, 0, 0);
		mfc_get((void *)&ls2[0], (unsigned int )&args.c[i], n, TAG, 0, 0);
		/* Waits for everything queued on TAG, including the previous
		 * iteration's put from ls3. */
		mfc_write_tag_mask(1 << TAG);
		mfc_read_tag_status_all();

		for (j = 0; j < (SIZE / 4); ++j)
			ls3[j] = spu_madd(s, ls2[j], ls1[j]);

		mfc_put((void *)&ls3[0], (unsigned int )&args.a[i], n, TAG, 0, 0);
	}
		
	/* Drain the last put issued by the loop. */
	mfc_write_tag_mask(1 << TAG);
	mfc_read_tag_status_all();

	if (unlikely(i < args.N)) {
		/* 
		 * args.N - i is at most SIZE at this point, so the remainder
		 * fits in the local buffers.
		 * The byte count is rounded DOWN to a multiple of 128 by the
		 * mask below; any bytes beyond that are not transferred.
		 */
		n = ((args.N - i) * sizeof(float)) & (~127);

		mfc_get((void *)&ls1[0], (unsigned int )&args.b[i], n, TAG, 0, 0);
		mfc_get((void *)&ls2[0], (unsigned int )&args.c[i], n, TAG, 0, 0);
		mfc_write_tag_mask(1 << TAG);
		mfc_read_tag_status_all();
		
		/*
		 * Iterates over (args.N - i) / 4 vectors, which can cover more
		 * than the n bytes fetched above; only n bytes are written back.
		 */
		for (j = 0; j < ((args.N - i) / 4); ++j)
			ls3[j] = spu_madd(s, ls2[j], ls1[j]);
		
		mfc_put((void *)&ls3[0], (unsigned int )&args.a[i], n, TAG, 0, 0);
		mfc_write_tag_mask(1 << TAG);
		mfc_read_tag_status_all();
	}

	/* 
	 * Elements past the rounded-down tail (if any) are left unprocessed
	 * by this function; i is not advanced by the tail step above.
	 */
}
Esempio n. 7
0
/**
 * Applies transport processes to x data
 */
void transport_buffer(uint32_t i, real_t size, real_t dt)
{
    const uint32_t length = conc[i].length;

    /* Reinterpret data as vectors */
    vector real_t* vconc = (vector real_t*)conc[i].data;
    vector real_t* vwind = (vector real_t*)wind[i].data;
    vector real_t* vdiff = (vector real_t*)diff[i].data;
    vector real_t* vbuff = (vector real_t*)buff[i].data;

    /* Splat scalars to vectors */
    vector real_t vsize = spu_splats(size);
    vector real_t vdt = spu_splats(dt);

    /* Wait for input buffer */
    wait_for_dma(i);

    /* Prepare boundary values */
    create_shift_boundary(i, vconc, vwind, vdiff, length);

    /* Transport in buffer */
    discretize(length, vconc, vwind, vdiff,
               cbound[i], wbound[i], dbound[i], vsize, vdt, vbuff);
}
Esempio n. 8
0
int main()
{
   TEST_SET_START("20040930102626EJL","EJL", "negated2");
   
   double x0n = hide_double(-0.0);
   double x0p = hide_double(0.0);
   double x1n = hide_double(-83532.96153153);
   double x1p = hide_double(83532.96153153);
   double x2n = hide_double(-0.0000000013152);
   double x2p = hide_double(0.0000000013152);
   double x3n = hide_double(-HUGE_VAL);
   double x3p = hide_double(HUGE_VAL);
   
   vec_double2 x0n_v = spu_splats(x0n);
   vec_double2 x0p_v = spu_splats(x0p);
   vec_double2 x1n_v = spu_splats(x1n);
   vec_double2 x1p_v = spu_splats(x1p);
   vec_double2 x2n_v = spu_splats(x2n);
   vec_double2 x2p_v = spu_splats(x2p);
   vec_double2 x3n_v = spu_splats(x3n);
   vec_double2 x3p_v = spu_splats(x3p);
   
   vec_double2 res_v;

   TEST_START("negated2");
   res_v = negated2(x0n_v);
   TEST_CHECK("20040930102629EJL", allequal_double2( res_v, x0p_v ), 0);
   res_v = negated2(x0p_v);
   TEST_CHECK("20040930102631EJL", allequal_double2( res_v, x0n_v ), 0);
   res_v = negated2(x1n_v);
   TEST_CHECK("20040930102632EJL", allequal_double2( res_v, x1p_v ), 0);
   res_v = negated2(x1p_v);
   TEST_CHECK("20040930102635EJL", allequal_double2( res_v, x1n_v ), 0);
   res_v = negated2(x2n_v);
   TEST_CHECK("20040930102637EJL", allequal_double2( res_v, x2p_v ), 0);
   res_v = negated2(x2p_v);
   TEST_CHECK("20040930102639EJL", allequal_double2( res_v, x2n_v ), 0);
   res_v = negated2(x3n_v);
   TEST_CHECK("20040930102641EJL", allposinf_double2( res_v ), 0);
   res_v = negated2(x3p_v);
   TEST_CHECK("20040930102643EJL", allneginf_double2( res_v ), 0);
   
   TEST_SET_DONE();
   
   TEST_EXIT();
}
Esempio n. 9
0
int main()
{
   TEST_SET_START("20040930102649EJL","EJL", "negatei4");
   
   int x0n = hide_int(0);
   int x0p = hide_int(0);
   int x1n = hide_int(-83532);
   int x1p = hide_int(83532);
   int x2n = hide_int(-13152);
   int x2p = hide_int(13152);
   int x3n = hide_int(-1);
   int x3p = hide_int(1);
   
   vec_int4 x0n_v = spu_splats(x0n);
   vec_int4 x0p_v = spu_splats(x0p);
   vec_int4 x1n_v = spu_splats(x1n);
   vec_int4 x1p_v = spu_splats(x1p);
   vec_int4 x2n_v = spu_splats(x2n);
   vec_int4 x2p_v = spu_splats(x2p);
   vec_int4 x3n_v = spu_splats(x3n);
   vec_int4 x3p_v = spu_splats(x3p);
   
   vec_int4 res_v;

   TEST_START("negatei4");
   res_v = negatei4(x0n_v);
   TEST_CHECK("20040930102652EJL", allequal_int4( res_v, x0p_v ), 0);
   res_v = negatei4(x0p_v);
   TEST_CHECK("20040930102653EJL", allequal_int4( res_v, x0n_v ), 0);
   res_v = negatei4(x1n_v);
   TEST_CHECK("20040930102655EJL", allequal_int4( res_v, x1p_v ), 0);
   res_v = negatei4(x1p_v);
   TEST_CHECK("20040930102657EJL", allequal_int4( res_v, x1n_v ), 0);
   res_v = negatei4(x2n_v);
   TEST_CHECK("20040930102659EJL", allequal_int4( res_v, x2p_v ), 0);
   res_v = negatei4(x2p_v);
   TEST_CHECK("20040930102701EJL", allequal_int4( res_v, x2n_v ), 0);
   res_v = negatei4(x3n_v);
   TEST_CHECK("20040930102703EJL", allequal_int4( res_v, x3p_v ), 0);
   res_v = negatei4(x3p_v);
   TEST_CHECK("20040930102705EJL", allequal_int4( res_v, x3n_v ), 0);
   
   TEST_SET_DONE();
   
   TEST_EXIT();
}
Esempio n. 10
0
// Selects the step/basis tables for the current lod parity, moves the world
// origin by (c_orig, r_orig) along the current column/row direction vectors,
// installs the direction vectors for the current parity, and finally
// recomputes the camera-space origin and direction vectors.
void SetBasisEtc(i32 c_orig, i32 r_orig)
{
  const bool odd_lod = (lod & 1) != 0;

  // The origin is advanced with the direction vectors that are in effect on
  // entry, i.e. before they are replaced below.
  origin_world += spu_splats((f32)c_orig)*dvc_world + spu_splats((f32)r_orig)*dvr_world;

  if (odd_lod)
  {
    step      = odd_step;
    inv_step  = odd_inv_step;
    basis_col = odd_basis_col;
    basis_row = odd_basis_row;

    // Odd parity: both direction vectors have x and z components.
    const f32 s = 1.0f * even_step;
    dvc_world = (vf32){ s,0,s,0};
    dvr_world = (vf32){-s,0,s,0};
  }
  else
  {
    step      = even_step;
    inv_step  = even_inv_step;
    basis_col = even_basis_col;
    basis_row = even_basis_row;

    // Even parity: columns run along x, rows along z.
    const f32 s = even_step;
    dvc_world = (vf32){s,0,0,0};
    dvr_world = (vf32){0,0,s,0};
  }

  // Bring origin and both direction vectors into camera space.
  const mtx4 &world_to_camera = (const mtx4 &)g_pViewData->m_world_to_camera_matrix;
  origin_camera = MatMulVec(world_to_camera, origin_world);
  dvc_camera    = MatMulVec(world_to_camera, dvc_world);
  dvr_camera    = MatMulVec(world_to_camera, dvr_world);
}
Esempio n. 11
0
/*
 * For every row index from k up to bs-1, subtracts A[row_index*bs + row]
 * times the vector row BKJ from the corresponding row of B:
 *
 *     B[row_index][*] -= BKJ[*] * A[row_index*bs + row]
 *
 * Rows are processed four floats (one vector) at a time, bs / 4 vectors per
 * row.
 */
void _compute( unsigned int bs, unsigned int k, unsigned int row, vector float *BKJ, float *B, float *A )
{
	const unsigned int vecs_per_row = bs / 4;

	while( k < bs )
	{
		// Multiplier for this row, broadcast to all four lanes.
		const vector float factor = spu_splats( A[k*bs+row] );

		// Vector view of row k of B.
		vector float *target = (vector float*)( B + ( k * bs ) );

		unsigned int v;
		for( v = 0 ; v < vecs_per_row ; ++v )
		{
			const vector float scaled = spu_mul( BKJ[v], factor );
			target[v] = spu_sub( target[v], scaled );
		}

		++k;
	}
}
Esempio n. 12
0
/*
 * Grows the heap by `increment` bytes.
 *
 * The heap break is kept in a function-local static that starts at &_end on
 * the first call. The request succeeds only if the gap between the current
 * stack pointer (word 0 of register 1) minus STACKSIZE and the heap break is
 * at least `increment`.
 *
 * On success the previous break is returned, the break is advanced, and
 * `increment` is subtracted from word 1 of the stack pointer register and of
 * every frame in the stack backchain.
 * On failure errno is set to ENOMEM and (void *) -1 is returned.
 */
void *
sbrk (ptrdiff_t increment)
{
	/* Current heap break; NULL until the first call. */
	static caddr_t heap_ptr = NULL;
	caddr_t base;
	vector unsigned int sp_reg, sp_delta;
	vector unsigned int *sp_ptr;
	caddr_t sps;

	/* The stack pointer register.  */
	volatile register vector unsigned int sp_r1 __asm__("1");
	
	if (heap_ptr == NULL)
	  heap_ptr = (caddr_t) & _end;
	
	/* Word 0 of the register holds the stack pointer value itself.  */
	sps = (caddr_t) spu_extract (sp_r1, 0);
	if (((int) sps - STACKSIZE - (int) heap_ptr) >= increment)
	  {
	    base = heap_ptr;
	    heap_ptr += increment;
	    
	    /* Vector that is zero except for `increment` in word 1.  */
	    sp_delta = (vector unsigned int) spu_insert (increment, spu_splats (0), 1);

	    /* Subtract sp_delta from the SP limit (word 1).  */
	    sp_r1 = spu_sub (sp_r1, sp_delta);
	    
	    /* Fix-up backchain: apply the same subtraction to each saved
	       frame vector, following word 0 of the loaded (unmodified)
	       value until it is zero.  */
	    sp_ptr = (vector unsigned int *) spu_extract (sp_r1, 0);
	    do
	      {
		sp_reg = *sp_ptr;
		*sp_ptr = (vector unsigned int) spu_sub (sp_reg, sp_delta);
	      }
	    while ((sp_ptr = (vector unsigned int *) spu_extract (sp_reg, 0)));

	    return (base);
	  }
	else
	  {
	    /* Not enough room between heap and stack.  */
	    errno = ENOMEM;
	    return ((void *) -1);
	  }
}
Esempio n. 13
0
/**
 * Records `nbins` as the bin count of slot `index`, clears that many vector
 * entries in the slot's min and max bin arrays, and publishes the arrays,
 * the count and a starting bestcost of 1000000 through `mmb`.
 *
 * No allocation happens here; the slot's arrays are used as they are.
 */
void ResetMinMaxBin(minmaxbin_t *mmb, int nbins, int index)
{
	numbins[index] = nbins;

	// Vector views of the slot's arrays; one vector store per bin index.
	vector float *min_vec = (vector float*)minbins[index];
	vector float *max_vec = (vector float*)maxbins[index];
	const vector float cleared = spu_splats(0.0f);

	int bin = 0;
	while(bin < numbins[index])
	{
		min_vec[bin] = cleared;
		max_vec[bin] = cleared;
		++bin;
	}

	mmb->numbins = numbins[index];
	mmb->minbins = minbins[index];
	mmb->maxbins = maxbins[index];
	mmb->bestcost = 1000000.f;
}
Esempio n. 14
0
/*
 * Tracing variant of the row-update kernel.
 *
 * For every row index from k up to bs-1 it subtracts A[row_index*bs + row]
 * times the vector row BKJ from the corresponding row of B, and reports
 * progress through the callbacks in `funcs`: the row number offset by
 * 900000 once per row, then the multiplier, the old value, the BKJ operand
 * and the new value for every vector.
 */
void _compute2( unsigned int bs, unsigned int k, unsigned int row, vector float *BKJ, float *B, float *A, Functions_t *funcs  )
{
	const unsigned int vecs_per_row = bs / 4;

	while( k < bs )
	{
		// Multiplier for this row, broadcast to all four lanes.
		vector float factor = spu_splats( A[k*bs+row] );

		// Vector view of row k of B.
		vector float *target = (vector float*)( B + ( k * bs ) );

		funcs->printuint( 900000 + k );

		unsigned int v;
		for( v = 0 ; v < vecs_per_row ; ++v )
		{
			// Dump the operands before the update ...
			funcs->printfloatv( &factor );
			funcs->printfloatv( &target[v] );
			funcs->printfloatv( &BKJ[v] );

			target[v] = spu_sub( target[v], spu_mul( BKJ[v], factor ) );

			// ... and the result after it.
			funcs->printfloatv( &target[v] );
		}

		++k;
	}
}
Esempio n. 15
0
/**
 * Evaluate the interpolated fragment-program inputs for one 2x2 quad.
 *
 * Z and W are evaluated from the coefficients in slot 0. For each of the
 * spu.vertex_info.num_attribs iterations, attribute slot (iteration + 1)
 * is evaluated and four vectors (one per channel) are appended to
 * fragInputs.
 *
 * \param x  quad x pos
 * \param y  quad y pos
 * \param fragZ  receives the quad's four Z values
 * \param fragInputs  receives four vectors per evaluated attribute
 */
static void
eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[])
{
   /* Per-pixel offsets of the four quad positions relative to (x, y). */
   static const vector float quadDX = (const vector float) {0, 1, 0, 1};
   static const vector float quadDY = (const vector float) {0, 0, 1, 1};

   const uint posSlot = 0;
   const vector float pos = setup.coef[posSlot].a0;
   const vector float dposdx = setup.coef[posSlot].dadx;
   const vector float dposdy = setup.coef[posSlot].dady;
   const vector float px = spu_splats(x) + quadDX;
   const vector float py = spu_splats(y) + quadDY;
   vector float quadW, invW;
   vector float *dst = fragInputs;
   uint slot;

   *fragZ = splatz(pos) + px * splatz(dposdx) + py * splatz(dposdy);
   quadW =  splatw(pos) + px * splatw(dposdx) + py * splatw(dposdy);
   invW = spu_re(quadW);  /* reciprocal of w */

   for (slot = 0; slot < spu.vertex_info.num_attribs; slot++, dst += 4) {
      /* Slot 0 was handled above; attributes start at slot 1. */
      const uint attr = slot + 1;
      const enum interp_mode mode = spu.vertex_info.attrib[attr].interp_mode;
      const int perspective = (mode == INTERP_PERSPECTIVE);

      /* Start from the constant coefficient, one splatted vector per channel. */
      const vector float base = setup.coef[attr].a0;
      vector float cx = splatx(base);
      vector float cy = splaty(base);
      vector float cz = splatz(base);
      vector float cw = splatw(base);

      if (mode == INTERP_LINEAR || perspective) {
         /* Add the x and y gradient contributions. */
         const vector float gx = setup.coef[attr].dadx;
         const vector float gy = setup.coef[attr].dady;

         cx = spu_madd(px, splatx(gx), spu_madd(py, splatx(gy), cx));
         cy = spu_madd(px, splaty(gx), spu_madd(py, splaty(gy), cy));
         cz = spu_madd(px, splatz(gx), spu_madd(py, splatz(gy), cz));
         cw = spu_madd(px, splatw(gx), spu_madd(py, splatw(gy), cw));

         if (perspective) {
            /* Perspective mode additionally scales by 1/w. */
            cx = spu_mul(cx, invW);
            cy = spu_mul(cy, invW);
            cz = spu_mul(cz, invW);
            cw = spu_mul(cw, invW);
         }
      }

      dst[CHAN0] = cx;
      dst[CHAN1] = cy;
      dst[CHAN2] = cz;
      dst[CHAN3] = cw;
   }
}


/**
 * Shade one 2x2 quad and hand it to the per-fragment operations.
 * No clipping is done here.
 *
 * Nothing happens when no bit of the mask is set. Otherwise both tiles are
 * marked dirty, the fragment inputs are evaluated, the current fragment
 * program runs, the fragments it kills are removed from the mask, and the
 * fragment-ops function selected by setup.facing is invoked.
 *
 * Note from the original author: about 1/5 to 1/7 of the time the mask is
 * zero and this function should be skipped, but testing for that in the
 * caller slowed things down overall.
 */
static INLINE void
emit_quad( int x, int y, mask_t mask)
{
   vector float inputs[4*4], outputs[2*4];
   vector unsigned int kill_mask;
   vector float fragZ;
   int ix, iy;

   /* Bail out when the OR across all mask words is zero. */
   if (!spu_extract(spu_orx(mask), 0))
      return;

   /* Position relative to the clip rectangle origin. */
   ix = x - setup.cliprect_minx;
   iy = y - setup.cliprect_miny;

   spu.cur_ctile_status = TILE_STATUS_DIRTY;
   spu.cur_ztile_status = TILE_STATUS_DIRTY;

   eval_inputs((float) x, (float) y, &fragZ, inputs);

   ASSERT(spu.fragment_program);
   ASSERT(spu.fragment_ops);

   /* Run the fragment program and drop the fragments it killed. */
   kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
   mask = spu_andc(mask, kill_mask);

   /* Per-fragment/quad operations. There is one function per facing
    * (front/back), indexed by setup.facing; the first four output
    * vectors are passed along with Z and the surviving mask.
    */
   spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
                    fragZ,
                    outputs[0*4+0],
                    outputs[0*4+1],
                    outputs[0*4+2],
                    outputs[0*4+3],
                    mask);
}


/**
 * Round an X or Y coordinate down to the even coordinate that starts the
 * 2x2 block/quad containing it.
 */
static INLINE int
block(int x)
{
   /* Removing the low bit is the same as masking with ~1. */
   const int low_bit = x & 1;
   return x - low_bit;
}


/**
 * Render a horizontal span of quads
 *
 * Reads the left/right bounds of the two span rows from setup.span.quad
 * (elements 0,1 = left of row 0/1; elements 2,3 = right of row 0/1), makes
 * sure the color tile (and, when depth/stencil is read, the Z tile) is
 * ready, then walks the span two pixels at a time and emits one quad per
 * step with a coverage mask. Finally the span state is reset.
 * Returns without doing anything when setup.span.y_flags is not 1, 2 or 3.
 */
static void
flush_spans(void)
{
   int minleft, maxright;

   const int l0 = spu_extract(setup.span.quad, 0);
   const int l1 = spu_extract(setup.span.quad, 1);
   const int r0 = spu_extract(setup.span.quad, 2);
   const int r1 = spu_extract(setup.span.quad, 3);

   /* Horizontal extent to walk, depending on which rows were written. */
   switch (setup.span.y_flags) {
   case 0x3:
      /* both odd and even lines written (both quad rows) */
      minleft = MIN2(l0, l1);
      maxright = MAX2(r0, r1);
      break;

   case 0x1:
      /* only even line written (quad top row) */
      minleft = l0;
      maxright = r0;
      break;

   case 0x2:
      /* only odd line written (quad bottom row) */
      minleft = l1;
      maxright = r1;
      break;

   default:
      return;
   }

   /* OK, we're very likely to need the tile data now.
    * clear or finish waiting if needed.
    */
   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
      /* wait for mfc_get() to complete */
      //printf("SPU: %u: waiting for ctile\n", spu.init.id);
      wait_on_mask(1 << TAG_READ_TILE_COLOR);
      spu.cur_ctile_status = TILE_STATUS_CLEAN;
   }
   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
      clear_c_tile(&spu.ctile);
      spu.cur_ctile_status = TILE_STATUS_DIRTY;
   }
   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);

   /* Same handling for the Z tile, only when depth/stencil is read. */
   if (spu.read_depth_stencil) {
      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
         /* wait for mfc_get() to complete */
         //printf("SPU: %u: waiting for ztile\n", spu.init.id);
         wait_on_mask(1 << TAG_READ_TILE_Z);
         spu.cur_ztile_status = TILE_STATUS_CLEAN;
      }
      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
         clear_z_tile(&spu.ztile);
         spu.cur_ztile_status = TILE_STATUS_DIRTY;
      }
      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
   }

   /* XXX this loop could be moved into the above switch cases... */
   
   /* Setup for mask calculation: the variable names spell out the element
    * layout (L/R = left/right of row 0, l/r = left/right of row 1).
    */
   const vec_int4 quad_LlRr = setup.span.quad;
   const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
   const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
   const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));

   const vec_int4 twos = spu_splats(2);

   /* X coordinates of the four pixels of the first quad. */
   const int x = block(minleft);
   vec_int4 xs = {x, x+1, x, x+1};

   /* Step one quad (two pixels) at a time up to the block holding maxright. */
   for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
      /**
       * Computes mask to indicate which pixels in the 2x2 quad are actually
       * inside the triangle's bounds.
       */
      
      /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
      /* (computed as NOT(left > x): nand of a value with itself inverts it) */
      const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
      const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs); 
      
      /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
      const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);

      /* Combine results to create mask */
      const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);

      emit_quad(spu_extract(xs, 0), setup.span.y, mask);
   }

   /* Reset the span state for the next pair of rows. */
   setup.span.y = 0;
   setup.span.y_flags = 0;
   /* Zero right elements */
   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
}


#if DEBUG_VERTS
/**
 * Debug helper: print a vertex's address and the four float components of
 * each of its spu.vertex_info.num_attribs attributes to stderr.
 * \param v  vertex to dump
 */
static void
print_vertex(const struct vertex_header *v)
{
   uint i;
   /* %p expects a pointer to void, so convert explicitly. */
   fprintf(stderr, "  Vertex: (%p)\n", (const void *) v);
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      /* i is unsigned, so it is printed with %u. */
      fprintf(stderr, "    %u: %f %f %f %f\n",  i, 
              spu_extract(v->data[i], 0),
              spu_extract(v->data[i], 1),
              spu_extract(v->data[i], 2),
              spu_extract(v->data[i], 3));
   }
}
Esempio n. 16
0
/*
 * Renders this SPU's share of a Mandelbrot image and DMAs it to main memory.
 *
 * Lines rank, rank+count, rank+2*count, ... below spu.height are computed,
 * four pixels at a time, into one half of a local two-line buffer and sent
 * with mfc_put to buf_ea + row * spu.width * 4 while the next line is being
 * computed into the other half.
 *
 * buf_ea: effective address of the first pixel of the destination image.
 *
 * NOTE(review): the local buffer holds two lines of 1920 pixels; this
 * presumably requires spu.width <= 1920 — confirm against the caller.
 */
void draw_frame(uint64_t buf_ea) {
    vec_uint4 buf[2*1920/4];
    int row, col, i, tag = 0;
    /* Distance between neighbouring pixels in the complex plane. */
    float step = 4.0f/spu.width*spu.zoom;
    float xbeg = spu.xc - spu.width*step*0.5f;
    /* x coordinates of the first four pixels of a line. */
    vec_float4 vxbeg = spu_splats(xbeg)
    + spu_splats(step) * (vec_float4) {
        0.f,1.f,2.f,3.f
    };
    /* Advancing by four pixels per inner-loop iteration. */
    vec_float4 xstep = spu_splats(step)*spu_splats(4.f);
    /* y coordinate of this SPU's first line; vinc moves to its next line. */
    vec_float4 vyp = spu_splats(spu.yc - spu.height*step*0.5f + step*spu.rank);
    const vec_float4 vinc = spu_splats(spu.count * step);
    const vec_float4 esc2 = spu_splats(BAILOUT*BAILOUT);
#if BAILBITS != 1
    const vec_float4 esc21 = spu_splats(4.f/(BAILOUT*BAILOUT));
#endif
    const vec_float4 two = spu_splats(2.f);
    const vec_float4 zero = spu_splats(0.f);
    const vec_float4 colsc = spu_splats(255.f);
    const vec_float4 ccr = spu_splats(4.f*BAILOUT/(3.5f*3.141592654f));
    const vec_float4 ccg = spu_splats(4.f*BAILOUT/(5.f*3.141592654f));
    const vec_float4 ccb = spu_splats(4.f*BAILOUT/(9.f*3.141592654f));
    vec_float4 x, y, x2, y2, m2, vxp;
    vec_uint4 cmp, inc;
    vec_uint4 vi;
    vec_uint4 *p, *b;
    vec_float4 co;

    /* Process the full image. As there are 6 SPUs working in parallel, each with
     * a different rank from 0 to 5, each SPU processes only the line numbers:
     * rank, rank+6, rank+12, ...
     * The program uses a SPU DMA programming technique known as "double buffering",
     * where the previously generated line is transmitted to main memory while we
     * compute the next one, hence the need for a local buffer containing two lines.
     */
    for (row = spu.rank; row < spu.height; row += spu.count) {
        /* Pixel buffer address (in local memory) of the next line to be drawn */
        /* (-tag is 0 or all ones, so the offset is 0 or 1920/4 vectors) */
        b = p = buf + ((1920/4)&-tag);
        vxp = vxbeg; /* first four x coordinates */
        /* Process a whole screen line by packets of 4 pixels */
        for (col = spu.width/4; col > 0 ; col--) {
            vi = spu_splats(0u);
            x = vxp;
            y = vyp;
            i = 0;
            cmp = spu_splats(-1u);
            inc = spu_splats(1u);
            m2 = zero;

            /* This loop processes the Mandelbrot suite for the four complex numbers
             * whose real part are the components of the x vector, and the imaginary
             * part are in y (as we process the same line, all initial values of y
             * are equal).
             * We perform loop unrolling for SPU performance optimization reasons,
             * hence the 4x replication of the same computation block.
             */
            do {
                x2 = x*x;
                y2 = y*y;
                /* m2 is only refreshed in lanes that had not escaped yet */
                m2 = spu_sel(m2, x2+y2, cmp);
                cmp = spu_cmpgt(esc2, m2);
                inc = spu_and(inc, cmp); /* increment the iteration count only if */
                vi = vi + inc;           /* we're still inside the bailout radius */
                y = two*x*y + vyp;
                x = x2-y2 + vxp;

                x2 = x*x;
                y2 = y*y;
                m2 = spu_sel(m2, x2+y2, cmp);
                cmp = spu_cmpgt(esc2, m2);
                inc = spu_and(inc, cmp);
                vi = vi + inc;
                y = two*x*y + vyp;
                x = x2-y2 + vxp;

                x2 = x*x;
                y2 = y*y;
                m2 = spu_sel(m2, x2+y2, cmp);
                cmp = spu_cmpgt(esc2, m2);
                inc = spu_and(inc, cmp);
                vi = vi + inc;
                y = two*x*y + vyp;
                x = x2-y2 + vxp;

                x2 = x*x;
                y2 = y*y;
                m2 = spu_sel(m2, x2+y2, cmp);
                cmp = spu_cmpgt(esc2, m2);
                inc = spu_and(inc, cmp);
                vi = vi + inc;
                y = two*x*y + vyp;
                x = x2-y2 + vxp;

                i += 4;
            }
            /* Exit the loop only if the iteration limit of 128 has been reached,
             * or all current four points are outside the bailout radius.
             * The __builtin_expect(xxx, 1) construct hints the compiler that the xxx
             * test has greater chance of being true (1), so a branch hinting
             * instruction is inserted into the binary code to make the conditional
             * branch faster in most cases (except the last one when we exit the
             * loop). This results in performance increase.
             */
            while (__builtin_expect((i < 128) &
                                    (si_to_int((qword)spu_gather(cmp)) != 0), 1));
            /* smooth coloring: compute the fractional part */
            co = spu_convtf(vi, 0) + spu_splats(1.f);
            co -= fast_logf(fast_logf(m2) * spu_splats(.5f));
#if BAILBITS != 1
            co = spu_re(spu_rsqrte(co*esc21));
#endif
            /* Compute the red, green and blue pixel components */
            vec_uint4 cr = spu_convtu(mcos(co * ccr) * colsc, 0);
            vec_uint4 cg = spu_convtu(mcos(co * ccg) * colsc, 0);
            vec_uint4 cb = spu_convtu(mcos(co * ccb) * colsc, 0);
            /* Put the 4 pixel values in the buffer */
            /* (lanes where inc is still 1, i.e. that never escaped, are
             * masked to zero by ~-inc) */
            *p++ = (spu_sl(cr, 16) | spu_sl(cg, 8) | cb) & ~-inc;

            vxp += xstep;
        }

        /* double-buffered dma: initiate a dma transfer of last computed scanline
         * then wait for completion of the second last transfer (previous computed
         * line). This is done by changing the tag value.
         */
        mfc_put(b, buf_ea+(spu.width*4)*row, spu.width*4, tag, 0, 0);
        tag = 1 - tag;
        wait_for_completion(tag);
        vyp += vinc;
    }
    /* wait for completion of last sent image line */
    wait_for_completion(1-tag);
}
Esempio n. 17
0
/**
 * Return a vector whose four elements all equal element CHAN3 of v.
 */
static INLINE vector float
splatw(vector float v)
{
   const float w = spu_extract(v, CHAN3);
   return spu_splats(w);
}
Esempio n. 18
0
/* Rough log2(x) approximation: the float's bit pattern is reinterpreted as
 * an integer, a bias constant is subtracted, and the difference is converted
 * back to float with a scale of 23 bits. */
inline vec_float4 fast_logf(vec_float4 x) {
    const vec_int4 bias = spu_splats((127<<23)-486411);
    const vec_int4 bits = (vec_int4)(qword)x;
    return spu_convtf(bits - bias, 23);
}
Esempio n. 19
0
/* Evaluates fast_sinf(x + 1.5) * .5 + .5; described by the original author
 * as returning -cos(x)*.5+.5. */
inline vec_float4 mcos(vec_float4 x) {
    const vec_float4 half = spu_splats(.5f);
    const vec_float4 sine = fast_sinf(x + spu_splats(1.5f));
    return sine * half + half;
}
Esempio n. 20
0
int main()
{
   TEST_SET_START("20040920142553EJL","EJL", "recipf4");

   unsigned int i0r = 0x7fffffff;  
   unsigned int i1 = 0xff000000;   // -2^127
   unsigned int i2 = 0xfe7fffff;   // -2^126 - 1 ulp
   unsigned int i2r = 0x80800001;  
   unsigned int i3 =   0x75013340; // random values
   unsigned int i3r =  0x09fd9f35;
   unsigned int i4 =   0x75e7753f; 
   unsigned int i4r =  0x090d9277;
   unsigned int i5 =   0x4c7fed5a; 
   unsigned int i5r =  0x32800954;
   unsigned int i6 =   0x3a0731f0; 
   unsigned int i6r =  0x44f2602e; 
   unsigned int i7 =   0x69784a07; 
   unsigned int i7r =  0x1583f9a3;

   float x0 = hide_float(0.0f);
   float x0r = hide_float(make_float(i0r));
   float x1 = hide_float(make_float(i1));
   float x1r = hide_float(0.0f);
   float x2 = hide_float(make_float(i2));
   float x2r = hide_float(make_float(i2r));   
   float x3 = hide_float(make_float(i3));
   float x3r = hide_float(make_float(i3r));   
   float x4 = hide_float(make_float(i4));     
   float x4r = hide_float(make_float(i4r));   
   float x5 = hide_float(make_float(i5));     
   float x5r = hide_float(make_float(i5r));   
   float x6 = hide_float(make_float(i6));     
   float x6r = hide_float(make_float(i6r));   
   float x7 = hide_float(make_float(i7));
   float x7r = hide_float(make_float(i7r));
   
   vec_float4 x0_v = spu_splats(x0);
   vec_float4 x0r_v = spu_splats(x0r);
   vec_float4 x1_v = spu_splats(x1);
   vec_float4 x1r_v = spu_splats(x1r);
   vec_float4 x2_v = spu_splats(x2);
   vec_float4 x2r_v = spu_splats(x2r);
   vec_float4 x3_v = spu_splats(x3);
   vec_float4 x3r_v = spu_splats(x3r);
   vec_float4 x4_v = spu_splats(x4);
   vec_float4 x4r_v = spu_splats(x4r);
   vec_float4 x5_v = spu_splats(x5);
   vec_float4 x5r_v = spu_splats(x5r);
   vec_float4 x6_v = spu_splats(x6);
   vec_float4 x6r_v = spu_splats(x6r);
   vec_float4 x7_v = spu_splats(x7);
   vec_float4 x7r_v = spu_splats(x7r);
   
   vec_float4 res_v;

   TEST_START("recipf4");
   res_v = recipf4(x0_v);
   TEST_CHECK("20040920142558EJL", allequal_float4( res_v, x0r_v ), 0);
   res_v = recipf4(x1_v);
   TEST_CHECK("20040920142600EJL", allequal_float4( res_v, x1r_v), 0);
   res_v = recipf4(x2_v);
   TEST_CHECK("20040920142602EJL", allequal_ulps_float4( res_v, x2r_v, 2 ), 0);
   res_v = recipf4(x3_v);
   TEST_CHECK("20040920142604EJL", allequal_ulps_float4( res_v, x3r_v, 2 ), 0);
   res_v = recipf4(x4_v);
   TEST_CHECK("20040920142606EJL", allequal_ulps_float4( res_v, x4r_v, 2 ), 0);
   res_v = recipf4(x5_v);
   TEST_CHECK("20040920142608EJL", allequal_ulps_float4( res_v, x5r_v, 2 ), 0);
   res_v = recipf4(x6_v);
   TEST_CHECK("20040920142609EJL", allequal_ulps_float4( res_v, x6r_v, 2 ), 0);
   res_v = recipf4(x7_v);
   TEST_CHECK("20040920142611EJL", allequal_ulps_float4( res_v, x7r_v, 2 ), 0);
   
   TEST_SET_DONE();
   
   TEST_EXIT();
}
Esempio n. 21
0
int main()
{
   TEST_SET_START("20040928174038EJL","EJL", "rsqrtd2");

   // Special-value operands. These need no reference vector: the checks
   // further down only classify the result (NaN, signed zero, signed infinity).
   const vec_double2 neg_inf_v  = spu_splats(hide_double(-HUGE_VAL));   // -Inf -> NaN
   const vec_double2 pos_inf_v  = spu_splats(hide_double(HUGE_VAL));    // +Inf -> +0
   const vec_double2 pos_zero_v = spu_splats(hide_double(0.0));         // +0   -> +Inf
   const vec_double2 neg_zero_v = spu_splats(hide_double(-0.0));        // -0   -> -Inf
   const vec_double2 nan_v      = spu_splats(hide_double(nan("")));     // NaN  -> NaN

   // 4.0 -> 0.5 is the one finite case compared with allequal_double2.
   const vec_double2 four_v = spu_splats(hide_double(4.0));
   const vec_double2 half_v = spu_splats(hide_double(0.5));

   // Remaining cases: operand and expected result are handed to make_double
   // as hexadecimal patterns, then replicated into both vector elements.
   const vec_double2 arg6_v   = spu_splats(hide_double(make_double(0x7464fff515d76f87ull)));
   const vec_double2 want6_v  = spu_splats(hide_double(make_double(0x25b3c03b72dba06cull)));
   const vec_double2 arg7_v   = spu_splats(hide_double(make_double(0x7606a4533cf5605eull)));
   const vec_double2 want7_v  = spu_splats(hide_double(make_double(0x24e3056f4b45f6a9ull)));
   const vec_double2 arg8_v   = spu_splats(hide_double(make_double(0x4beae58c6f48733eull)));
   const vec_double2 want8_v  = spu_splats(hide_double(make_double(0x39f173b787396c5full)));
   const vec_double2 arg9_v   = spu_splats(hide_double(make_double(0x3999ed5c8316b00bull)));
   const vec_double2 want9_v  = spu_splats(hide_double(make_double(0x43192359a70ec761ull)));
   const vec_double2 arg10_v  = spu_splats(hide_double(make_double(0x68f7885c4b84b793ull)));
   const vec_double2 want10_v = spu_splats(hide_double(make_double(0x2b6a62d48c269d90ull)));
   const vec_double2 arg11_v  = spu_splats(hide_double(make_double(0x1aabc083c5c26227ull)));
   const vec_double2 want11_v = spu_splats(hide_double(make_double(0x52912e543817fabbull)));

   vec_double2 got_v;

   TEST_START("rsqrtd2");

   // Result classification for the special operands.
   got_v = rsqrtd2(neg_inf_v);
   TEST_CHECK("20040928174042EJL", allnan_double2( got_v ), 0);
   got_v = rsqrtd2(pos_inf_v);
   TEST_CHECK("20040928174045EJL", allposzero_double2( got_v ), 0);
   got_v = rsqrtd2(pos_zero_v);
   TEST_CHECK("20040928174047EJL", allposinf_double2( got_v ), 0);
   got_v = rsqrtd2(neg_zero_v);
   TEST_CHECK("20040928174049EJL", allneginf_double2( got_v ), 0);
   got_v = rsqrtd2(nan_v);
   TEST_CHECK("20040928174054EJL", allnan_double2( got_v ), 0);

   // Exact comparison.
   got_v = rsqrtd2(four_v);
   TEST_CHECK("20040928174058EJL", allequal_double2( got_v, half_v ), 0);

   // Comparisons through allequal_ulps_double2 with a tolerance argument of 1.
   got_v = rsqrtd2(arg6_v);
   TEST_CHECK("20040928174101EJL", allequal_ulps_double2( got_v, want6_v, 1 ), 0);
   got_v = rsqrtd2(arg7_v);
   TEST_CHECK("20040928174104EJL", allequal_ulps_double2( got_v, want7_v, 1 ), 0);
   got_v = rsqrtd2(arg8_v);
   TEST_CHECK("20040928174106EJL", allequal_ulps_double2( got_v, want8_v, 1 ), 0);
   got_v = rsqrtd2(arg9_v);
   TEST_CHECK("20040928174108EJL", allequal_ulps_double2( got_v, want9_v, 1 ), 0);
   got_v = rsqrtd2(arg10_v);
   TEST_CHECK("20040928174110EJL", allequal_ulps_double2( got_v, want10_v, 1 ), 0);
   got_v = rsqrtd2(arg11_v);
   TEST_CHECK("20040928174113EJL", allequal_ulps_double2( got_v, want11_v, 1 ), 0);

   TEST_SET_DONE();

   TEST_EXIT();
}
Esempio n. 22
0
void ClipToRectangle(vf32 clip_min, vf32 clip_max)
{
  // Express both corners in grid units relative to origin_world, rounding the
  // minimum corner down and the maximum corner up to whole gridpoints.
  const vf32 to_grid = spu_splats(inv_step);
  const vi32 lo_cell = VecFloor4(spu_mul(clip_min - origin_world, to_grid));
  const vi32 hi_cell = VecCeil4 (spu_mul(clip_max - origin_world, to_grid));

  // Columns are read from element 0 and rows from element 2. The rectangle is
  // grown by one gridpoint on each side because quads incident to the verts
  // culled here get culled as well.
  const i32 left   = spu_extract(lo_cell, 0) - 1;
  const i32 right  = spu_extract(hi_cell, 0) + 1;
  const i32 top    = spu_extract(lo_cell, 2) - 1;
  const i32 bottom = spu_extract(hi_cell, 2) + 1;

  // Half-open loop ranges, trimmed so that only columns in [0, nc) and rows
  // in [0, nr) are ever visited.
  i32 col_begin = left;
  if (col_begin < 0)
    col_begin = 0;

  i32 col_end = nc;
  if (right < nc)
    col_end = right + 1;

  i32 row_begin = top;
  if (row_begin < 0)
    row_begin = 0;

  i32 row_end = nr;
  if (bottom < nr)
    row_end = bottom + 1;

  // Bit OR-ed into an outcode to mark the gridpoint as culled.
  const u8 cull_bit = 0x80;

  // Left edge: one column, walked row by row.
  if (left >= 0 && left < nc)
  {
    for (i32 row = row_begin; row < row_end; ++row)
      g_Outcodes[row*nc + left] |= cull_bit;
  }

  // Right edge.
  if (right >= 0 && right < nc)
  {
    for (i32 row = row_begin; row < row_end; ++row)
      g_Outcodes[row*nc + right] |= cull_bit;
  }

  // Upper edge: one row, walked column by column.
  if (top >= 0 && top < nr)
  {
    for (i32 col = col_begin; col < col_end; ++col)
      g_Outcodes[top*nc + col] |= cull_bit;
  }

  // Lower edge.
  if (bottom >= 0 && bottom < nr)
  {
    for (i32 col = col_begin; col < col_end; ++col)
      g_Outcodes[bottom*nc + col] |= cull_bit;
  }
}
Esempio n. 23
0
int main()
{
   TEST_SET_START("20040928105926EJL","EJL", "divf4");

   // Every case is a (numerator, denominator, expected quotient) triple.
   // Each value is handed to make_float as a hexadecimal pattern, passed
   // through hide_float, and replicated across all four vector elements.

   // case 0
   const vec_float4 num0_v  = spu_splats(hide_float(make_float(0x75013340u)));
   const vec_float4 den0_v  = spu_splats(hide_float(make_float(0x75e7753fu)));
   const vec_float4 quot0_v = spu_splats(hide_float(make_float(0x3e8ee64bu)));

   // case 1
   const vec_float4 num1_v  = spu_splats(hide_float(make_float(0x4c7fed5au)));
   const vec_float4 den1_v  = spu_splats(hide_float(make_float(0x3a0731f0u)));
   const vec_float4 quot1_v = spu_splats(hide_float(make_float(0x51f24e86u)));

   // case 2
   const vec_float4 num2_v  = spu_splats(hide_float(make_float(0x5b08b303u)));
   const vec_float4 den2_v  = spu_splats(hide_float(make_float(0x562f5046u)));
   const vec_float4 quot2_v = spu_splats(hide_float(make_float(0x44479d24u)));

   // case 3
   const vec_float4 num3_v  = spu_splats(hide_float(make_float(0x748a9b87u)));
   const vec_float4 den3_v  = spu_splats(hide_float(make_float(0x6b014b46u)));
   const vec_float4 quot3_v = spu_splats(hide_float(make_float(0x49093864u)));

   // case 4
   const vec_float4 num4_v  = spu_splats(hide_float(make_float(0x35dcf9d8u)));
   const vec_float4 den4_v  = spu_splats(hide_float(make_float(0x6278d6e0u)));
   const vec_float4 quot4_v = spu_splats(hide_float(make_float(0x12e355b5u)));

   // case 5
   const vec_float4 num5_v  = spu_splats(hide_float(make_float(0x74d505fdu)));
   const vec_float4 den5_v  = spu_splats(hide_float(make_float(0x61ef565eu)));
   const vec_float4 quot5_v = spu_splats(hide_float(make_float(0x5263daa3u)));

   vec_float4 got_v;

   TEST_START("divf4");

   // All comparisons go through allequal_ulps_float4 with a tolerance argument of 2.
   got_v = divf4(num0_v, den0_v);
   TEST_CHECK("20040928105932EJL", allequal_ulps_float4( got_v, quot0_v, 2 ), 0);
   got_v = divf4(num1_v, den1_v);
   TEST_CHECK("20040928105934EJL", allequal_ulps_float4( got_v, quot1_v, 2 ), 0);
   got_v = divf4(num2_v, den2_v);
   TEST_CHECK("20040928105936EJL", allequal_ulps_float4( got_v, quot2_v, 2 ), 0);
   got_v = divf4(num3_v, den3_v);
   TEST_CHECK("20040928105938EJL", allequal_ulps_float4( got_v, quot3_v, 2 ), 0);
   got_v = divf4(num4_v, den4_v);
   TEST_CHECK("20040928105940EJL", allequal_ulps_float4( got_v, quot4_v, 2 ), 0);
   got_v = divf4(num5_v, den5_v);
   TEST_CHECK("20040928105943EJL", allequal_ulps_float4( got_v, quot5_v, 2 ), 0);

   TEST_SET_DONE();

   TEST_EXIT();
}
Esempio n. 24
0
int main()
{
   TEST_SET_START("20040928191240EJL","EJL", "fmodf4");

   // Scalar operands: (numerator, denominator, expected remainder) per case,
   // each handed to make_float as a hexadecimal pattern. The scalars are kept
   // because the fmodf checks at the end use them directly.
   const float num0 = hide_float(make_float(0x449edbc6u));
   const float den0 = hide_float(make_float(0x40cf799du));
   const float rem0 = hide_float(make_float(0x3daa7300u));

   const float num1 = hide_float(make_float(0x6bca107au));
   const float den1 = hide_float(make_float(0x6c4a107au));
   const float rem1 = hide_float(make_float(0x6bca107au));

   const float num2 = hide_float(make_float(0x1c123605u));
   const float den2 = hide_float(make_float(0x1c923602u));
   const float rem2 = hide_float(make_float(0x1c123605u));

   const float num3 = hide_float(make_float(0x2b4c50fau));
   const float den3 = hide_float(make_float(0x253a3ae3u));
   const float rem3 = hide_float(make_float(0x25141df9u));

   const float num4 = hide_float(make_float(0x73addffcu));
   const float den4 = hide_float(make_float(0x742ddffcu));
   const float rem4 = hide_float(make_float(0x73addffcu));

   const float num5 = hide_float(make_float(0x29d4d97cu));
   const float den5 = hide_float(make_float(0x2a546e77u));
   const float rem5 = hide_float(make_float(0x29d4d97cu));

   // Vector operands: every scalar replicated across the four elements.
   const vec_float4 num0_v = spu_splats(num0);
   const vec_float4 den0_v = spu_splats(den0);
   const vec_float4 rem0_v = spu_splats(rem0);

   const vec_float4 num1_v = spu_splats(num1);
   const vec_float4 den1_v = spu_splats(den1);
   const vec_float4 rem1_v = spu_splats(rem1);

   const vec_float4 num2_v = spu_splats(num2);
   const vec_float4 den2_v = spu_splats(den2);
   const vec_float4 rem2_v = spu_splats(rem2);

   const vec_float4 num3_v = spu_splats(num3);
   const vec_float4 den3_v = spu_splats(den3);
   const vec_float4 rem3_v = spu_splats(rem3);

   const vec_float4 num4_v = spu_splats(num4);
   const vec_float4 den4_v = spu_splats(den4);
   const vec_float4 rem4_v = spu_splats(rem4);

   const vec_float4 num5_v = spu_splats(num5);
   const vec_float4 den5_v = spu_splats(den5);
   const vec_float4 rem5_v = spu_splats(rem5);

   float got;
   vec_float4 got_v;

   // Vector variant, compared through allequal_ulps_float4 with a tolerance argument of 1.
   TEST_START("fmodf4");
   got_v = fmodf4(num0_v, den0_v);
   TEST_CHECK("20040928191245EJL", allequal_ulps_float4( got_v, rem0_v, 1 ), 0);
   got_v = fmodf4(num1_v, den1_v);
   TEST_CHECK("20040928191247EJL", allequal_ulps_float4( got_v, rem1_v, 1 ), 0);
   got_v = fmodf4(num2_v, den2_v);
   TEST_CHECK("20040928191249EJL", allequal_ulps_float4( got_v, rem2_v, 1 ), 0);
   got_v = fmodf4(num3_v, den3_v);
   TEST_CHECK("20040928191251EJL", allequal_ulps_float4( got_v, rem3_v, 1 ), 0);
   got_v = fmodf4(num4_v, den4_v);
   TEST_CHECK("20040928191253EJL", allequal_ulps_float4( got_v, rem4_v, 1 ), 0);
   got_v = fmodf4(num5_v, den5_v);
   TEST_CHECK("20040928191255EJL", allequal_ulps_float4( got_v, rem5_v, 1 ), 0);

   // Scalar variant on the same operands; passes when ulpDiff_f is at most 1.
   TEST_START("fmodf");
   got = fmodf(num0, den0);
   TEST_CHECK("20040928191258EJL", ulpDiff_f( got, rem0 ) <= 1, 0);
   got = fmodf(num1, den1);
   TEST_CHECK("20040928191300EJL", ulpDiff_f( got, rem1 ) <= 1, 0);
   got = fmodf(num2, den2);
   TEST_CHECK("20040928191302EJL", ulpDiff_f( got, rem2 ) <= 1, 0);
   got = fmodf(num3, den3);
   TEST_CHECK("20040928191303EJL", ulpDiff_f( got, rem3 ) <= 1, 0);
   got = fmodf(num4, den4);
   TEST_CHECK("20040928191305EJL", ulpDiff_f( got, rem4 ) <= 1, 0);
   got = fmodf(num5, den5);
   TEST_CHECK("20040928191307EJL", ulpDiff_f( got, rem5 ) <= 1, 0);

   TEST_SET_DONE();

   TEST_EXIT();
}
Esempio n. 25
0
int main()
{
   TEST_SET_START("20040916145017EJL","EJL", "floorf");

   // Operand / expected-floor pairs.
   const float in0   = hide_float(0.91825f);
   const float want0 = hide_float(0.0f);
   const float in1   = hide_float(-0.12958f);
   const float want1 = hide_float(-1.0f);
   const float in2   = hide_float(-79615.1875f);
   const float want2 = hide_float(-79616.0f);

   // 0x4affffff is 2^23 - 0.5, the largest value that still has a fractional
   // part; its floor is 0x4afffffe (2^23 - 1).
   const float in3   = hide_float(make_float(0x4affffffu));
   const float want3 = hide_float(make_float(0x4afffffeu));

   // 0x4b000000 is 2^23: no fractional part, so the result equals the operand.
   const float in4   = hide_float(make_float(0x4b000000u));
   const float want4 = hide_float(make_float(0x4b000000u));

   // 0xcf000001 is -(2^31 + 2^8): a large negative value that is already
   // integral, so again the result equals the operand.
   const float in5   = hide_float(make_float(0xcf000001u));
   const float want5 = hide_float(make_float(0xcf000001u));

   // The same values replicated across all four vector elements.
   const vec_float4 in0_v   = spu_splats(in0);
   const vec_float4 want0_v = spu_splats(want0);
   const vec_float4 in1_v   = spu_splats(in1);
   const vec_float4 want1_v = spu_splats(want1);
   const vec_float4 in2_v   = spu_splats(in2);
   const vec_float4 want2_v = spu_splats(want2);
   const vec_float4 in3_v   = spu_splats(in3);
   const vec_float4 want3_v = spu_splats(want3);
   const vec_float4 in4_v   = spu_splats(in4);
   const vec_float4 want4_v = spu_splats(want4);
   const vec_float4 in5_v   = spu_splats(in5);
   const vec_float4 want5_v = spu_splats(want5);

   float got;
   vec_float4 got_v;

   // Vector variant: compared with allequal_float4.
   TEST_START("floorf4");
   got_v = floorf4(in0_v);
   TEST_CHECK("20040916145022EJL", allequal_float4( got_v, want0_v ), 0);
   got_v = floorf4(in1_v);
   TEST_CHECK("20040916145024EJL", allequal_float4( got_v, want1_v ), 0);
   got_v = floorf4(in2_v);
   TEST_CHECK("20040916145027EJL", allequal_float4( got_v, want2_v ), 0);
   got_v = floorf4(in3_v);
   TEST_CHECK("20040916145029EJL", allequal_float4( got_v, want3_v ), 0);
   got_v = floorf4(in4_v);
   TEST_CHECK("20040916145032EJL", allequal_float4( got_v, want4_v ), 0);
   got_v = floorf4(in5_v);
   TEST_CHECK("20040916145034EJL", allequal_float4( got_v, want5_v ), 0);

   // Scalar variant on the same operands: compared with ==.
   TEST_START("floorf");
   got = floorf(in0);
   TEST_CHECK("20040916155814EJL", got == want0, 0);
   got = floorf(in1);
   TEST_CHECK("20040916155818EJL", got == want1, 0);
   got = floorf(in2);
   TEST_CHECK("20040916155822EJL", got == want2, 0);
   got = floorf(in3);
   TEST_CHECK("20040916155825EJL", got == want3, 0);
   got = floorf(in4);
   TEST_CHECK("20040916155827EJL", got == want4, 0);
   got = floorf(in5);
   TEST_CHECK("20040916155830EJL", got == want5, 0);

   TEST_SET_DONE();

   TEST_EXIT();
}
Esempio n. 26
0
// Claims and renders work described by a chain of 128-byte render-task cache
// lines, following cache->next from one line to the next.
//
// eah_render_tasks / eal_render_tasks: high and low words of the effective
// address from which the 64-bit address of the first cache line is fetched.
//
// For every cache line the outer loop:
//   1. returns at once if spu_stat_in_mbox() is non-zero;
//   2. loads the line with MFC_GETLLAR_CMD and derives, per chunk, whether it
//      is short, done (already at endTriangle), free or busy;
//   3. picks a chunk (short ones are preferred), marks it busy, optionally
//      reserves a free chunk, and commits with MFC_PUTLLC_CMD, starting over
//      on the same line when that conditional store fails;
//   4. walks the triangle list until findFirstTriangleTile() returns a
//      non-negative tile, re-splits the chunk around that tile, and processes
//      triangles until endTriangle is reached (or after one triangle when the
//      whole block had to be kept);
//   5. flushes the tile buffers.
//
// NOTE(review): this listing stops inside the outer while loop; the code that
// closes the loop and the function is not part of the excerpt.
void process_render_tasks(unsigned long eah_render_tasks, unsigned long eal_render_tasks)
{
    const vec_uchar16 SHUFFLE_MERGE_BYTES = (vec_uchar16) {	// merge lo bytes from unsigned shorts (array)
        1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
    };

    // NOTE(review): only 12 initialisers are listed for this 16-byte vector, and
    // the constant is not referenced anywhere in the visible part of the function.
    const vec_uchar16 SHUFFLE_GET_BUSY_WITH_ONES = (vec_uchar16) {	// get busy flag with ones in unused bytes
        0xc0, 0xc0, 2, 3, 0xc0,0xc0,0xc0,0xc0, 0xc0,0xc0,0xc0,0xc0
    };

    // NOTE(review): not referenced in the visible part of the function.
    const vec_uchar16 ZERO_BYTES = (vec_uchar16) spu_splats(0);

    // DMA target for triangle data; offsets of up to 127 bytes into it are used below.
    // NOTE(review): declared without an explicit alignment attribute — confirm the
    // DMA alignment requirements are met by the toolchain's default placement.
    char trianglebuffer[ 256 + TRIANGLE_MAX_SIZE ];

    // over-allocate by 127 bytes and round the address up to a 128-byte boundary
    char	sync_buffer[128+127];
    void*	aligned_sync_buffer = (void*) ( ((unsigned long)sync_buffer+127) & ~127 );

    RenderableCacheLine* cache = (RenderableCacheLine*) aligned_sync_buffer;
    unsigned long long cache_ea;

    // fetch the address of the first cache line (tag 0) and wait for the transfer
    spu_mfcdma64(&cache_ea, eah_render_tasks, eal_render_tasks, sizeof(cache_ea), 0, MFC_GET_CMD);
    mfc_write_tag_mask(1<<0);
    mfc_read_tag_status_all();

    while (cache_ea) {
        // terminate immediately if possible
        if (spu_stat_in_mbox())
            return;

        // read the cache line
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
        spu_readch(MFC_RdAtomicStat);

        unsigned int endTriangle = cache->endTriangle;
        vec_ushort8 testTriangle = spu_splats((unsigned short) endTriangle);

        // first look for short chunks
        vec_uchar16 next = cache->chunkNext;
        vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK));

        // change next to word offset, note we don't care what the low bit shifted in is
        vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 );
        vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1));
        vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254));

        vec_ushort8 start0 = cache->chunkStart[0];
        vec_ushort8 start1 = cache->chunkStart[1];

        // gather, for every chunk, the start value of the chunk its "next" index points at
        // (SHUF0 / SHUF1 / MERGE are defined outside this function)
        vec_ushort8 nstart0 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) );
        vec_ushort8 nstart1 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) );

        vec_ushort8 starteq0 = spu_cmpeq( nstart0, spu_splats((unsigned short)0) );
        vec_ushort8 starteq1 = spu_cmpeq( nstart1, spu_splats((unsigned short)0) );

        // a next-start of 0 is replaced by 4096 before the length is taken
        vec_ushort8 end0 = spu_sel( nstart0, spu_splats((unsigned short)4096), starteq0);
        vec_ushort8 end1 = spu_sel( nstart1, spu_splats((unsigned short)4096), starteq1);

        vec_ushort8 len0 = spu_sub( end0, start0);
        vec_ushort8 len1 = spu_sub( end1, start1);

        // "small" means a length below 17
        vec_ushort8 small0 = spu_cmpgt( spu_splats((unsigned short)17), len0);
        vec_ushort8 small1 = spu_cmpgt( spu_splats((unsigned short)17), len1);
        vec_uchar16 small = (vec_uchar16) spu_shuffle( small0, small1, MERGE );
        vec_uint4 smallChunkGather = spu_gather(small);

        // check to see if chunk is already at the last triangle
        vec_uint4 doneChunkGather = spu_gather( (vec_uchar16) spu_shuffle(
                (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[0]),
                (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[1]),
                SHUFFLE_MERGE_BYTES) );

        // check if the chunk is free
        vec_uint4 freeChunkGather = spu_gather(
                                        spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) );

        // check to see if the chunk is being processed
        vec_uint4 busyChunkGather = spu_gather(
                                        spu_cmpgt( cache->chunkNext, //spu_and(cache->chunkNext, CHUNKNEXT_MASK),
                                                spu_splats( (unsigned char) (CHUNKNEXT_BUSY_BIT-1) ) ) );

        // doneChunkGather, freeChunkGather, busyChunkGather - rightmost 16 bits of word 0
        // note that if freeChunkGather is true then busyChunkGather must also be true

        // done=false, free=false, busy=false -> can process
        // free=false, busy=false -> can be merged

        // decide which chunk to process
        vec_uint4 mayProcessGather = spu_nor( doneChunkGather, busyChunkGather );
        vec_uint4 mayProcessShortGather = spu_and( mayProcessGather, smallChunkGather );

        // prefer the short candidates; fall back to all candidates when there are none
        vec_uint4 shortSelMask = spu_cmpeq( mayProcessShortGather, spu_splats(0U) );
        vec_uint4 mayProcessSelection = spu_sel( mayProcessShortGather, mayProcessGather, shortSelMask );

        /*
        		if (!spu_extract(shortSelMask, 0))
        			printf("taken short: may=%04x short=%04x mayshort=%04x mask=%04x sel=%04x\n",
        				spu_extract(mayProcessGather, 0) & 0xffff,
        				spu_extract(smallChunkGather, 0),
        				spu_extract(mayProcessShortGather, 0),
        				spu_extract(shortSelMask, 0) & 0xffff,
        				spu_extract(mayProcessSelection, 0) & 0xffff );
        */

        // move the 16 flag bits to the top of the word so the leading-zero count is
        // a chunk index; a count of 32 is what the "!= 32" / "== 32" tests below
        // treat as "no such chunk"
        vec_uint4 mayProcessBits = spu_sl( mayProcessSelection, 16);
        unsigned int chunkToProcess = spu_extract( spu_cntlz( mayProcessBits ), 0);
        unsigned int freeChunk = spu_extract( spu_cntlz( spu_sl( freeChunkGather, 16 ) ), 0);

        // if there's nothing to process, try the next cache line in the rendering tasks list
        if (!spu_extract(mayProcessBits, 0)) {
            // also the target of the "goto trynextcacheline" further down
trynextcacheline:
            cache_ea = cache->next;
            // sleep();
            continue;
        }

        unsigned int chunkStart    	= cache->chunkStartArray   [chunkToProcess];
        unsigned int chunkTriangle	= cache->chunkTriangleArray[chunkToProcess];
        unsigned int chunkNext		= cache->chunkNextArray	   [chunkToProcess] & CHUNKNEXT_MASK;
        unsigned int chunkEnd		= (cache->chunkStartArray  [chunkNext]-1) & (NUMBER_OF_TILES-1);
        unsigned int chunkLength	= 1 + chunkEnd-chunkStart;

        // only need an extra block if the block is especially long
        if (chunkLength <= NUMBER_OF_TILES_PER_CHUNK) {
            freeChunk = 32;
        }

        // mark this block as busy
        cache->chunkNextArray[chunkToProcess] |= CHUNKNEXT_BUSY_BIT;

        // if there's at least one free chunk, claim it
        if (freeChunk != 32) {
            cache->chunkNextArray[freeChunk] = CHUNKNEXT_RESERVED;
            cache->chunkTriangleArray[freeChunk] = chunkTriangle;
        }

        // write the cache line back
        // (on failure cache_ea is unchanged, so the same line is re-read and re-evaluated)
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
        if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
            continue;

#ifdef INFO
        printf("[%d] Claimed chunk %d (%d-%d len %d) at tri %x end %x with free chunk %d\n", _SPUID,
               chunkToProcess, chunkStart, chunkEnd, chunkLength, chunkTriangle, endTriangle,
               freeChunk!=32 ? freeChunk : -1 );
//		debug_render_tasks(cache);
#endif

        // walk the triangle list from chunkTriangle until one reports a tile in
        // [chunkStart, chunkEnd] or endTriangle is reached
        Triangle* triangle;
        int firstTile;
        do {
            // read the triangle data for the current triangle
            // (the transfer starts at the 128-byte boundary at or below the triangle;
            //  "extra" is the triangle's offset from that boundary)
            unsigned int extra = chunkTriangle & 127;
            unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
            triangle = (Triangle*) (trianglebuffer+extra);
            unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

            // ensure DMA slot available
            do {} while (!spu_readchcnt(MFC_Cmd));

            spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea),
                         length, 0, MFC_GET_CMD);
            mfc_write_tag_mask(1<<0);
            mfc_read_tag_status_all();

            // get the triangle deltas
            firstTile = findFirstTriangleTile(triangle, chunkStart, chunkEnd);

            if (firstTile>=0)
                break;

            // no match, try next triangle
            chunkTriangle = triangle->next_triangle;
        } while (chunkTriangle != endTriangle);

        // if we actually have something to process...
        if (firstTile>=0) {
            // the "normal" splitting will now become:
            // chunkStart .. (firstTile-1)	-> triangle->next_triangle
            // firstTile .. (firstTile+NUM-1) -> chunkTriangle (BUSY)
            // (firstTile+NUM) .. chunkEnd -> chunkTriangle (FREE)

            int tailChunk;
            int thisChunk;
            int nextBlockStart;
            int thisBlockStart;
            int realBlockStart;
            // repeat the whole split under a fresh reservation until the
            // conditional store at the bottom of the loop succeeds
            do {
retry:
                // read the cache line
                spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
                spu_readch(MFC_RdAtomicStat);

                // calculate start of next block
                nextBlockStart = firstTile + NUMBER_OF_TILES_PER_CHUNK;
                if (nextBlockStart > chunkEnd)
                    nextBlockStart = chunkEnd+1;

                // calculate start of block to mark as busy
                thisBlockStart = nextBlockStart - NUMBER_OF_TILES_PER_CHUNK;
                if (thisBlockStart < chunkStart)
                    thisBlockStart = chunkStart;
                realBlockStart = thisBlockStart;

#ifdef INFO
                printf("[%d] nextBlockStart=%d, realBlockStart=%d, thisBlockStart=%d, chunkStart=%d\n", _SPUID,
                       nextBlockStart, realBlockStart, thisBlockStart, chunkStart);
#endif


                // allocate some more free chunks
                vec_uint4 freeChunkGather2 = spu_sl(spu_gather(spu_cmpeq(
                                                        spu_splats((unsigned char)CHUNKNEXT_FREE_BLOCK), cache->chunkNext)), 16);
                unsigned int freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);

                if (freeChunk == 32) {
                    // if we didn't have one before, try again
                    freeChunk = freeChunk2;

                    // and try to get the second one
                    freeChunkGather2 = spu_andc( freeChunkGather2, spu_promote(0x80000000>>freeChunk2, 0) );
                    freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);
                } else {
                    // speculatively clear the free chunk just in case we don't need it
                    cache->chunkNextArray[freeChunk] = CHUNKNEXT_FREE_BLOCK;
                }

#ifdef INFO
                printf("[%d] Free chunks %d and %d, cN=%d, nBS=%d, cE=%d, tBS=%d, cS=%d\n",
                       _SPUID, freeChunk, freeChunk2, chunkNext, nextBlockStart, chunkEnd, thisBlockStart, chunkStart );
#endif

                // mark region after as available for processing if required
                if (nextBlockStart < chunkEnd) {
                    if (freeChunk==32) {
                        // if no free chunk, relinquish entire block and write back
                        cache->chunkNextArray[chunkToProcess] = chunkNext;
                        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
                        // if writeback failed, we *might* have a free block, retry
                        if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
                            goto retry;

                        // otherwise give up and try the next cache line
                        // (jumps to the label inside the "nothing to process" branch
                        //  near the top of the outer loop)
                        goto trynextcacheline;
                    }
                    cache->chunkStartArray[freeChunk] = nextBlockStart;
                    cache->chunkNextArray[freeChunk] = chunkNext;
                    cache->chunkTriangleArray[freeChunk] = chunkTriangle;
                    cache->chunkNextArray[chunkToProcess] = freeChunk | CHUNKNEXT_BUSY_BIT;
                    tailChunk = freeChunk;
#ifdef INFO
                    printf("[%d] Insert tail, tailChunk=%d, chunkNext=%d, chunkToProcess=%d\n", _SPUID, tailChunk, chunkNext, chunkToProcess);
                    debug_render_tasks(cache);
#endif
                } else {
                    // we're gonna use freeChunk2 for the "in front" block, as we've not
                    // used freeChunk, let's use it as it's more likely to have a free chunk
                    freeChunk2 = freeChunk;
                    tailChunk = chunkNext;
                }

                // mark region before as available if required and possible
                thisChunk = chunkToProcess;
                if (thisBlockStart > chunkStart) {
                    if (freeChunk2 != 32) {
                        // mark this region as busy
                        cache->chunkStartArray[freeChunk2]=thisBlockStart;
                        cache->chunkNextArray[freeChunk2]=tailChunk | CHUNKNEXT_BUSY_BIT;
                        cache->chunkTriangleArray[freeChunk2]=chunkTriangle;

                        // mark region before as available for processing
                        cache->chunkNextArray[chunkToProcess]=freeChunk2;
                        cache->chunkTriangleArray[chunkToProcess]=triangle->next_triangle;
                        thisChunk = freeChunk2;
#ifdef INFO
                        printf("[%d] Insert new head, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk);
                        debug_render_tasks(cache);
#endif
                    } else {
                        // need to keep whole block, update info and mark busy
                        cache->chunkTriangleArray[chunkToProcess]=chunkTriangle;
                        cache->chunkNextArray[chunkToProcess]=tailChunk | CHUNKNEXT_BUSY_BIT;
                        realBlockStart = chunkStart;
                        // NOTE(review): the printf, debug_render_tasks and sleep calls in this
                        // branch are outside the (empty) #ifdef INFO / #endif pair, so they
                        // are compiled into every build — confirm that is intended.
                        printf("[%d] Keep whole block, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk);
                        debug_render_tasks(cache);
#ifdef INFO
#endif
                        sleep();
                    }
                }

                // merge chunks
                merge_cache_blocks(cache);

                // write the cache line back
                spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
            } while (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS);

            // finally after the write succeeded, update the variables
            chunkNext = tailChunk;
            chunkToProcess = thisChunk;
            chunkStart = firstTile; //thisBlockStart;
            chunkLength = nextBlockStart - firstTile;
            chunkEnd = chunkStart + chunkLength - 1;
            freeChunk = 32;

            // now we can process the block up to endTriangle
            initTileBuffers(thisBlockStart, chunkEnd);

            // passed as the last argument of processTriangleChunks: 0 for the first
            // triangle handled here, 1 for every later one
            int ok=0;
            while (chunkTriangle != endTriangle) {
#ifdef INFO
                printf("[%d] Processing chunk %d at %4d len %4d, triangle %04x first=%d tbs=%d\n",
                       _SPUID, chunkToProcess, chunkStart, chunkLength,
                       chunkTriangle, firstTile, thisBlockStart);
#endif
                // and actually process that triangle on these chunks
                processTriangleChunks(triangle, cache, thisBlockStart, chunkEnd, chunkTriangle, ok);
                ok=1;
#ifdef PAUSE
                sleep();
#endif
                // and advance to the next-triangle
                chunkTriangle = triangle->next_triangle;

                // this should only ever happen if we're running really low on cache line slots
                // basically, if we pick up a block with more than NUMBER_OF_TILES_PER_CHUNK and
                // there's no slot to store the pre-NUMBER_OF_TILES_PER_CHUNK tiles.
                // in this case, we process from thisBlockStart only (because we know that from
                // chunkStart to there has no result) and then we only process one triangle
                if (chunkStart != realBlockStart) {
                    /*
                    printf("[%d] chunkStart=%d != realBlockStart %d, chunkEnd=%d, "
                    	"firstTile=%d chunk=%d\n",
                    	_SPUID, chunkStart, realBlockStart, chunkEnd,
                    	firstTile, chunkToProcess);
                    debug_render_tasks(cache);
                    */

                    // abort the while loop
                    break;
                }

                // read the next triangle
                unsigned int extra = chunkTriangle & 127;
                unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
                triangle = (Triangle*) (trianglebuffer+extra);
                unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

                // ensure DMA slot available
                do {} while (!spu_readchcnt(MFC_Cmd));

                spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea),
                             mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD);
                mfc_write_tag_mask(1<<0);
                mfc_read_tag_status_all();
            } // until chunkTriangle == endTriangle

            // flush any output buffers
            flushTileBuffers(thisBlockStart, chunkEnd);

        } // firstTile>=0
Esempio n. 27
0
// Sets up the base (lod 0) grid: reads the step size from g_R2OCon, derives its
// reciprocal, snaps the water object's clip window outwards onto grid points,
// pads it by 8 steps per side, derives the column / row counts (rounded up to a
// multiple of 8 and capped at 80), fills in the even / odd basis vectors,
// records the lod-0 origin and packed dimensions in g_RenderData and finally
// calls SetBasisEtc(0,0).
// Takes no arguments and returns nothing; all results are written to variables
// declared outside this function.
void InitBasisEtc()
{
  // Use a fixed initial step size for now; 128m for lod 0.
  // This yields an fft tile size of 32 x 128m = 4096m for lod 0
  // and maximum dimensions of 8192m x 8192m
  step = g_R2OCon.m_step;
  // step lives in lanes 0 and 2 only - the same lanes that are read back below
  // for the column and row counts; lanes 1 and 3 stay zero
  vf32 step_vec = (vf32){step, 0, step, 0};
  
  // get inverse-step using float magic (since taking the reciprocal of a power of 2 yields a 1-bit error)
  // si_ilhu(0x7F00) places 0x7F000000 in every word and si_sf(a, b) computes b - a,
  // so the bit pattern of step is integer-subtracted from 0x7F000000. That mirrors
  // the exponent field and is only an exact reciprocal when step is a power of two.
  qword q_step  = si_from_float(step);
  qword q_magic = si_ilhu(0x7F00);
  inv_step = si_to_float(si_sf(q_step, q_magic));

  // set clip window
  clip_min = g_WaterObject.m_origin;
  clip_max = g_WaterObject.m_origin + g_WaterObject.m_dimensions;

  // set origin at gridpoint below clip min
  // 8388608 = 2^23: adding and then subtracting 1.5 * 2^23 * step drops every
  // bit of lanes 0 and 2 that is finer than one step. Lanes 1 and 3 add and
  // subtract 0 and are therefore unchanged.
  // NOTE(review): landing on the gridpoint *below* presumably relies on the SPU's
  // truncating single-precision arithmetic - confirm before porting.
  f32 magic_float = 1.5f * 8388608.0f * step;
  vf32 magic_vf32 = (vf32){magic_float, 0, magic_float, 0};
  origin_world = (clip_min + magic_vf32) - magic_vf32;

  // compute gridpoint above clip max
  // (snap with the same trick, then move one whole step further out)
  vf32 max_corner = (clip_max + magic_vf32) - magic_vf32;
  max_corner += step_vec;

  // offset both corners by the necessary amount of padding
  // (8 steps on each side, i.e. 16 extra steps per axis in total)
  origin_world -= step_vec * spu_splats(8.0f);
  max_corner   += step_vec * spu_splats(8.0f);

  // set num cols & num rows
  // (number of grid points spanned along lane 0 and lane 2 respectively)
  vf32 dims = max_corner - origin_world;
  nc = (i32)(spu_extract(dims,0) * inv_step) + 1;
  nr = (i32)(spu_extract(dims,2) * inv_step) + 1;

  // record true nc, nr
  // (the counts without the 8 + 8 padding steps added above)
  true_nc = nc - 16;
  true_nr = nr - 16;

  // alignment requirements (ooh, that's a bit strict)
  // round both counts up to the next multiple of 8
  nc = (nc + 7) & -8;
  nr = (nr + 7) & -8;

  // deal with large grids
  // cap each axis at 80 points (64 unpadded) and shrink dims to match
  if (nc > 80)
  {
    nc = 80;
    true_nc = 64;
    dims = spu_insert((nc-1)*step, dims, 0);
  }
  if (nr > 80)
  {
    nr = 80;
    true_nr = 64;
    dims = spu_insert((nr-1)*step, dims, 2);
  }
  // NOTE(review): max_corner is a local and is not read again in this function,
  // so this assignment has no visible effect here.
  max_corner = origin_world + dims;


  // even basis: axis-aligned unit vectors along lane 0 (columns) and lane 2 (rows)
  even_step = step;
  even_inv_step = inv_step;
  even_basis_col = (vf32){1.0f, 0.0f, 0.0f, 0.0f};
  even_basis_row = (vf32){0.0f, 0.0f, 1.0f, 0.0f};

  // odd basis: r approximates 1/sqrt(2), so the odd step is step/sqrt(2) and
  // its inverse is inv_step * sqrt(2) (written as r * 2); the basis vectors are
  // the even ones turned by 45 degrees in the lane-0 / lane-2 plane
  const f32 r = 0.707106781187f;
  odd_step    = even_step * r;
  odd_inv_step= even_inv_step * r * 2.0f;
  odd_basis_col = (vf32){ r, 0.0f, r, 0.0f};
  odd_basis_row = (vf32){-r, 0.0f, r, 0.0f};

  // start out on the even basis; dvc_world / dvr_world are one step along it
  basis_col = even_basis_col;
  basis_row = even_basis_row;
  dvc_world = spu_splats(step) * basis_col;
  dvr_world = spu_splats(step) * basis_row;

  // set base lod origin
  g_RenderData.m_origins[0]   = origin_world;
  // pack both counts into one value: nc above bit 8, nr in the low 8 bits
  g_RenderData.m_cols_rows[0] = nc<<8 | nr;
  c0_amb = 0;
  r0_amb = 0;

  SetBasisEtc(0,0);
}
Esempio n. 28
0
/* Element-wise double precision division of two 2-lane vectors: returns
 * a_in / b_in for each lane.
 *
 * The routine classifies both operands (NaN, infinity, zero, denormal),
 * rescales denormal inputs, divides the mantissas with a reciprocal estimate
 * refined by Newton-Raphson steps, and then applies the exponent difference
 * and the special-value results through a single final multiply.
 */
vector double
__divv2df3 (vector double a_in, vector double b_in)
{
    /* Variables */
    vec_int4    exp, exp_bias;
    vec_uint4   no_underflow, overflow;
    vec_float4  mant_bf, inv_bf;
    vec_ullong2 exp_a, exp_b;
    vec_ullong2 a_nan, a_zero, a_inf, a_denorm, a_denorm0;
    vec_ullong2 b_nan, b_zero, b_inf, b_denorm, b_denorm0;
    vec_ullong2 nan;
    vec_uint4   a_exp, b_exp;
    vec_ullong2 a_mant_0, b_mant_0;
    vec_ullong2 a_exp_1s, b_exp_1s;
    vec_ullong2 sign_exp_mask;

    vec_double2 a, b;
    vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult;

    /* Constants */
    /* exponent field of a double as seen in its high 32-bit word */
    vec_uint4   exp_mask_u32 = spu_splats((unsigned int)0x7FF00000);
    /* shuffle pattern: copy the high word of each doubleword into both of its words */
    vec_uchar16 splat_hi = (vec_uchar16) {
        0,1,2,3, 0,1,2,3,  8, 9,10,11, 8,9,10,11
    };
    /* shuffle pattern: swap the two 32-bit words inside each doubleword */
    vec_uchar16 swap_32 = (vec_uchar16) {
        4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
    };
    vec_ullong2 exp_mask = spu_splats(0x7FF0000000000000ULL);
    vec_ullong2 sign_mask = spu_splats(0x8000000000000000ULL);
    vec_float4  onef = spu_splats(1.0f);
    vec_double2 one = spu_splats(1.0);
    /* a double whose biased exponent field is 0x035 and whose mantissa is 0;
     * used below to turn denormal inputs into normalized numbers
     */
    vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);

    sign_exp_mask = spu_or(sign_mask, exp_mask);

    /* Extract the floating point components from each of the operands including
     * exponent and mantissa.
     */
    a_exp = (vec_uint4)spu_and((vec_uint4)a_in, exp_mask_u32);
    a_exp = spu_shuffle(a_exp, a_exp, splat_hi);
    b_exp = (vec_uint4)spu_and((vec_uint4)b_in, exp_mask_u32);
    b_exp = spu_shuffle(b_exp, b_exp, splat_hi);

    /* x_mant_0: all ones per doubleword when the 52 mantissa bits are all zero.
     * The compare is done per 32-bit word, so each word is ANDed with its
     * swapped neighbour to get a result that holds for the whole doubleword.
     */
    a_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_mask), 0);
    a_mant_0 = spu_and(a_mant_0, spu_shuffle(a_mant_0, a_mant_0, swap_32));

    b_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_mask), 0);
    b_mant_0 = spu_and(b_mant_0, spu_shuffle(b_mant_0, b_mant_0, swap_32));

    /* x_exp_1s: all ones when the exponent field is all ones (infinity or NaN) */
    a_exp_1s = (vec_ullong2)spu_cmpeq(a_exp, exp_mask_u32);
    b_exp_1s = (vec_ullong2)spu_cmpeq(b_exp, exp_mask_u32);

    /* Identify all possible special values that must be accommodated including:
     * +-denorm, +-0, +-infinity, and NaNs.
     *
     *   x_denorm0 : exponent field is zero (zero or denormal)
     *   x_nan     : exponent all ones, mantissa non-zero
     *   x_zero    : exponent zero, mantissa zero
     *   x_inf     : exponent all ones, mantissa zero
     *   x_denorm  : exponent zero, mantissa non-zero
     */
    a_denorm0= (vec_ullong2)spu_cmpeq(a_exp, 0);
    a_nan    = spu_andc(a_exp_1s, a_mant_0);
    a_zero   = spu_and (a_denorm0, a_mant_0);
    a_inf    = spu_and (a_exp_1s, a_mant_0);
    a_denorm = spu_andc(a_denorm0, a_zero);

    b_denorm0= (vec_ullong2)spu_cmpeq(b_exp, 0);
    b_nan    = spu_andc(b_exp_1s, b_mant_0);
    b_zero   = spu_and (b_denorm0, b_mant_0);
    b_inf    = spu_and (b_exp_1s, b_mant_0);
    b_denorm = spu_andc(b_denorm0, b_zero);

    /* Scale denorm inputs into normalized numbers by conditionally scaling the
     * input parameters.
     *
     * OR-ing exp_53 into a denormal gives it an exponent; subtracting exp_53
     * carrying the operand's sign (the spu_sel on sign_mask) leaves just the
     * mantissa contribution as a normalized value. The resulting scale is
     * undone further down by the 0x34 exponent correction.
     */
    a = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_mask));
    a = spu_sel(a_in, a, a_denorm);

    b = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_mask));
    b = spu_sel(b_in, b, b_denorm);

    /* Extract the divisor and dividend exponent and force parameters into the signed
     * range [1.0,2.0) or (-2.0,-1.0] by replacing their exponent field with
     * the exponent of 1.0.
     */
    exp_a = spu_and((vec_ullong2)a, exp_mask);
    exp_b = spu_and((vec_ullong2)b, exp_mask);

    mant_a = spu_sel(a, one, (vec_ullong2)exp_mask);
    mant_b = spu_sel(b, one, (vec_ullong2)exp_mask);

    /* Approximate the single reciprocal of b by using
     * the single precision reciprocal estimate followed by one
     * single precision iteration of Newton-Raphson.
     */
    mant_bf = spu_roundtf(mant_b);
    inv_bf = spu_re(mant_bf);
    inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf);

    /* Perform 2 more Newton-Raphson iterations in double precision. The
     * result (q1) is in the range (0.5, 2.0).
     *
     * The first step refines the reciprocal (inv_b); the second refines the
     * quotient itself by adding the residual (mant_a - mant_b * q0) * inv_b.
     */
    inv_b = spu_extend(inv_bf);
    inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b);
    q0 = spu_mul(mant_a, inv_b);
    q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0);

    /* Determine the exponent correction factor that must be applied
     * to q1 by taking into account the exponent of the normalized inputs
     * and the scale factors that were applied to normalize them.
     *
     * The arithmetic shift by 20 moves the exponent difference out of the
     * exponent field; a rescaled denormal dividend lowers it by 0x34 and a
     * rescaled denormal divisor raises it by 0x34.
     */
    exp = spu_rlmaska(spu_sub((vec_int4)exp_a, (vec_int4)exp_b), -20);
    exp = spu_add(exp, (vec_int4)spu_add(spu_and((vec_int4)a_denorm, -0x34), spu_and((vec_int4)b_denorm, 0x34)));

    /* Bias the quotient exponent depending on the sign of the exponent correction
     * factor so that a single multiplier will ensure the entire double precision
     * domain (including denorms) can be achieved.
     *
     * spu_rlmaska(exp, -31) is 0 for a non-negative exp and -1 for a negative
     * one, so the XOR with 64 yields +64 or -65 respectively:
     *
     *    exp            bias q1     adjust exp
     *   =====           ========    ==========
     *   non-negative     2^+64         -64
     *   negative         2^-65         +65
     */
    exp_bias = spu_xor(spu_rlmaska(exp, -31), 64);
    exp = spu_sub(exp, exp_bias);

    /* add the bias to q1's exponent field only (selected through exp_mask) */
    q1 = spu_sel(q1, (vec_double2)spu_add((vec_int4)q1, spu_sl(exp_bias, 20)), exp_mask);

    /* Compute a multiplier (mult) to be applied to the quotient (q1) to produce the
     * expected result. On overflow, clamp the multiplier to the maximum non-infinite
     * number in case the rounding mode is not round-to-nearest.
     *
     * exp is re-biased by 0x3FF; a result of zero or below makes the multiplier
     * zero (no_underflow clears it), otherwise it is shifted back into the
     * exponent field.
     */
    exp = spu_add(exp, 0x3FF);
    no_underflow = spu_cmpgt(exp, 0);
    overflow = spu_cmpgt(exp, 0x7FE);
    exp = spu_and(spu_sl(exp, 20), (vec_int4)no_underflow);
    exp = spu_and(exp, (vec_int4)exp_mask);

    mult = spu_sel((vec_double2)exp, (vec_double2)(spu_add((vec_uint4)exp_mask, -1)), (vec_ullong2)overflow);

    /* Handle special value conditions. These include:
     *
     * 1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN a NaN
     *    results.
     * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN a INFINITY results.
     * 3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0 results.
     *
     * The statements below apply these in reverse priority order (3, then 2,
     * then 1) so that the higher-priority case overrides the lower one.
     */
    mult = spu_andc(mult, (vec_double2)spu_or(a_zero, b_inf));
    mult = spu_sel(mult, (vec_double2)exp_mask, spu_or(a_inf, b_zero));

    nan = spu_or(a_nan, b_nan);
    nan = spu_or(nan, spu_and(a_zero, b_zero));
    nan = spu_or(nan, spu_and(a_inf, b_inf));

    /* OR-ing the all-ones nan mask into mult makes the multiplier itself a NaN */
    mult = spu_or(mult, (vec_double2)nan);

    /* Scale the final quotient */

    q2 = spu_mul(q1, mult);

    return (q2);
}
Esempio n. 29
0
int main()
{
   TEST_SET_START("20040908022501EJL","EJL", "fabs");
   
   double x0n = hide_double(-0.0);
   double x0p = hide_double(0.0);
   double x1n = hide_double(-83532.96153153);
   double x1p = hide_double(83532.96153153);
   double x2n = hide_double(-0.0000000013152);
   double x2p = hide_double(0.0000000013152);
   double x3n = hide_double(-HUGE_VAL);
   double x3p = hide_double(HUGE_VAL);
   
   vec_double2 x0n_v = spu_splats(x0n);
   vec_double2 x0p_v = spu_splats(x0p);
   vec_double2 x1n_v = spu_splats(x1n);
   vec_double2 x1p_v = spu_splats(x1p);
   vec_double2 x2n_v = spu_splats(x2n);
   vec_double2 x2p_v = spu_splats(x2p);
   vec_double2 x3n_v = spu_splats(x3n);
   vec_double2 x3p_v = spu_splats(x3p);
   
   double res;
   vec_double2 res_v;

   TEST_START("fabsd2");
   res_v = fabsd2(x0n_v);
   TEST_CHECK("20040908022502EJL", allequal_double2( res_v, x0p_v ), 0);
   res_v = fabsd2(x0p_v);
   TEST_CHECK("20040908022503EJL", allequal_double2( res_v, x0p_v ), 0);
   res_v = fabsd2(x1n_v);
   TEST_CHECK("20040908022504EJL", allequal_double2( res_v, x1p_v ), 0);
   res_v = fabsd2(x1p_v);
   TEST_CHECK("20040908022505EJL", allequal_double2( res_v, x1p_v ), 0);
   res_v = fabsd2(x2n_v);
   TEST_CHECK("20040908022506EJL", allequal_double2( res_v, x2p_v ), 0);
   res_v = fabsd2(x2p_v);
   TEST_CHECK("20040908022507EJL", allequal_double2( res_v, x2p_v ), 0);
   res_v = fabsd2(x3n_v);
   TEST_CHECK("20040908022508EJL", allposinf_double2( res_v ), 0);
   res_v = fabsd2(x3p_v);
   TEST_CHECK("20040908022509EJL", allposinf_double2( res_v ), 0);
   
   TEST_START("fabs");
   res = fabs( x0n );
   TEST_CHECK("20040908022510EJL", res == x0p, 0);
   res = fabs( x0p );
   TEST_CHECK("20040908022511EJL", res == x0p, 0);
   res = fabs( x1n );
   TEST_CHECK("20040908022512EJL", res == x1p, 0);
   res = fabs( x1p );
   TEST_CHECK("20040908022513EJL", res == x1p, 0);
   res = fabs( x2n );
   TEST_CHECK("20040908022514EJL", res == x2p, 0);
   res = fabs( x2p );
   TEST_CHECK("20040908022515EJL", res == x2p, 0);
   res = fabs( x3n );
   TEST_CHECK("20040908022516EJL", isinf(res) == 1, 0);
   res = fabs( x3p );
   TEST_CHECK("20040908022517EJL", isinf(res) == 1, 0);
   
   TEST_SET_DONE();
   
   TEST_EXIT();
}
Esempio n. 30
0
// Test driver for isgreaterequald2.
//
// Each case defines a scalar pair (x, y) and the expected 64-bit mask r:
// all ones when x >= y is expected to hold, all zeros otherwise (which
// includes every case where x is a NaN). The scalars are splatted into both
// lanes of a vector, so both lanes must produce the same mask.
//
// Every TEST_CHECK carries its own identifier (...000AAN to ...019AAN) so that
// a reported failure maps to exactly one case.
int main()
{
   TEST_SET_START("20060825000000AAN","AAN", "isgreaterequald2");

   //-QNaN: NG (a NaN operand never compares greater-or-equal)
   double x0 = hide_double(-nan(""));
   double y0 = hide_double(1.0);
   unsigned long long r0 = 0x0000000000000000ull;

   //+Inf > -Inf
   double x1 = hide_double( HUGE_VAL);
   double y1 = hide_double(-HUGE_VAL);
   unsigned long long r1 = 0xffffffffffffffffull;

   //-Inf < -Dmax
   double x2 = hide_double(-HUGE_VAL);
   double y2 = hide_double(-DBL_MAX);
   unsigned long long r2 = 0x0000000000000000ull;

   //-Norm > -Inf
   double x3 = hide_double(-67418234.34256245);
   double y3 = hide_double(-HUGE_VAL);
   unsigned long long r3 = 0xffffffffffffffffull;

   //-Norm < -Denorm
   double x4 = hide_double(-273453.3234458053);
   double y4 = hide_double(-3.0e-321);
   unsigned long long r4 = 0x0000000000000000ull;

   //-Norm = -Norm
   double x5 = hide_double(-168.97345223013);
   double y5 = hide_double(-168.97345223013);
   unsigned long long r5 = 0xffffffffffffffffull;

   //-Norm > -Norm
   double x6 = hide_double(-168.97345223013);
   double y6 = hide_double(-21345853556.492);
   unsigned long long r6 = 0xffffffffffffffffull;

   //-Norm < -0
   double x7 = hide_double(-168.97345223013);
   double y7 = hide_double(-0.0);
   unsigned long long r7 = 0x0000000000000000ull;

   //-Unf > -Norm (the literal is too small in magnitude for a double)
   double x8 = hide_double(-1.0e-999);
   double y8 = hide_double(-83532.96153153);
   unsigned long long r8 = 0xffffffffffffffffull;

   //-Unf = 0
   double x9 = hide_double(-1.0e-999);
   double y9 = hide_double(0.0);
   unsigned long long r9 = 0xffffffffffffffffull;

   //-0 = 0
   double x10 = hide_double(-0.0);
   double y10 = hide_double( 0.0);
   unsigned long long r10 = 0xffffffffffffffffull;

   //+Unf = 0
   double x11 = hide_double( 1.0e-999);
   double y11 = hide_double( 0.0);
   unsigned long long r11 = 0xffffffffffffffffull;

   //+Unf < +Norm
   double x12 = hide_double( 1e-999);
   double y12 = hide_double(0.0031529324);
   unsigned long long r12 = 0x0000000000000000ull;

   //+Norm > +Denorm
   double x13 = hide_double(5172.2845321);
   double y13 = hide_double(3.0e-321);
   unsigned long long r13 = 0xffffffffffffffffull;

   //+Norm = +Norm
   double x14 = hide_double(5172.2845321);
   double y14 = hide_double(5172.2845321);
   unsigned long long r14 = 0xffffffffffffffffull;

   //+Norm < +Norm
   double x15 = hide_double(264.345643345);
   double y15 = hide_double(2353705.31415);
   unsigned long long r15 = 0x0000000000000000ull;

   //+Norm > -Norm
   double x16 = hide_double( 926.605118542);
   double y16 = hide_double(-9.43574552184);
   unsigned long long r16 = 0xffffffffffffffffull;

   //+Norm < +Dmax
   double x17 = hide_double( 926.605118542);
   double y17 = hide_double(DBL_MAX);
   unsigned long long r17 = 0x0000000000000000ull;
   
   //+Inf > +Dmax
   double x18 = hide_double(HUGE_VAL);
   double y18 = hide_double(DBL_MAX);
   unsigned long long r18 = 0xffffffffffffffffull;

   //+QNaN: NG (a NaN operand never compares greater-or-equal)
   double x19 = hide_double(nan(""));
   double y19 = hide_double(3.14);
   unsigned long long r19 = 0x0000000000000000ull;

   // Replicate every scalar into both lanes of a vector.
   vec_double2 x0_v = spu_splats(x0);
   vec_double2 y0_v = spu_splats(y0);
   vec_ullong2 r0_v = spu_splats(r0);

   vec_double2 x1_v = spu_splats(x1);
   vec_double2 y1_v = spu_splats(y1);
   vec_ullong2 r1_v = spu_splats(r1);

   vec_double2 x2_v = spu_splats(x2);
   vec_double2 y2_v = spu_splats(y2);
   vec_ullong2 r2_v = spu_splats(r2);

   vec_double2 x3_v = spu_splats(x3);
   vec_double2 y3_v = spu_splats(y3);
   vec_ullong2 r3_v = spu_splats(r3);

   vec_double2 x4_v = spu_splats(x4);
   vec_double2 y4_v = spu_splats(y4);
   vec_ullong2 r4_v = spu_splats(r4);

   vec_double2 x5_v = spu_splats(x5);
   vec_double2 y5_v = spu_splats(y5);
   vec_ullong2 r5_v = spu_splats(r5);

   vec_double2 x6_v = spu_splats(x6);
   vec_double2 y6_v = spu_splats(y6);
   vec_ullong2 r6_v = spu_splats(r6);

   vec_double2 x7_v = spu_splats(x7);
   vec_double2 y7_v = spu_splats(y7);
   vec_ullong2 r7_v = spu_splats(r7);

   vec_double2 x8_v = spu_splats(x8);
   vec_double2 y8_v = spu_splats(y8);
   vec_ullong2 r8_v = spu_splats(r8);

   vec_double2 x9_v = spu_splats(x9);
   vec_double2 y9_v = spu_splats(y9);
   vec_ullong2 r9_v = spu_splats(r9);

   vec_double2 x10_v = spu_splats(x10);
   vec_double2 y10_v = spu_splats(y10);
   vec_ullong2 r10_v = spu_splats(r10);

   vec_double2 x11_v = spu_splats(x11);
   vec_double2 y11_v = spu_splats(y11);
   vec_ullong2 r11_v = spu_splats(r11);

   vec_double2 x12_v = spu_splats(x12);
   vec_double2 y12_v = spu_splats(y12);
   vec_ullong2 r12_v = spu_splats(r12);

   vec_double2 x13_v = spu_splats(x13);
   vec_double2 y13_v = spu_splats(y13);
   vec_ullong2 r13_v = spu_splats(r13);

   vec_double2 x14_v = spu_splats(x14);
   vec_double2 y14_v = spu_splats(y14);
   vec_ullong2 r14_v = spu_splats(r14);

   vec_double2 x15_v = spu_splats(x15);
   vec_double2 y15_v = spu_splats(y15);
   vec_ullong2 r15_v = spu_splats(r15);

   vec_double2 x16_v = spu_splats(x16);
   vec_double2 y16_v = spu_splats(y16);
   vec_ullong2 r16_v = spu_splats(r16);

   vec_double2 x17_v = spu_splats(x17);
   vec_double2 y17_v = spu_splats(y17);
   vec_ullong2 r17_v = spu_splats(r17);

   vec_double2 x18_v = spu_splats(x18);
   vec_double2 y18_v = spu_splats(y18);
   vec_ullong2 r18_v = spu_splats(r18);

   vec_double2 x19_v = spu_splats(x19);
   vec_double2 y19_v = spu_splats(y19);
   vec_ullong2 r19_v = spu_splats(r19);
     
   vec_ullong2 res_v;

   TEST_START("isgreaterequald2");

   // Run every case; the check identifier's last digits match the case number.
   res_v = (vec_ullong2)isgreaterequald2(x0_v, y0_v);
   TEST_CHECK("20060825000000AAN", allequal_ullong2( res_v, r0_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x1_v, y1_v);
   TEST_CHECK("20060825000001AAN", allequal_ullong2( res_v, r1_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x2_v, y2_v);
   TEST_CHECK("20060825000002AAN", allequal_ullong2( res_v, r2_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x3_v, y3_v);
   TEST_CHECK("20060825000003AAN", allequal_ullong2( res_v, r3_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x4_v, y4_v);
   TEST_CHECK("20060825000004AAN", allequal_ullong2( res_v, r4_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x5_v, y5_v);
   TEST_CHECK("20060825000005AAN", allequal_ullong2( res_v, r5_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x6_v, y6_v);
   TEST_CHECK("20060825000006AAN", allequal_ullong2( res_v, r6_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x7_v, y7_v);
   TEST_CHECK("20060825000007AAN", allequal_ullong2( res_v, r7_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x8_v, y8_v);
   TEST_CHECK("20060825000008AAN", allequal_ullong2( res_v, r8_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x9_v, y9_v);
   TEST_CHECK("20060825000009AAN", allequal_ullong2( res_v, r9_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x10_v, y10_v);
   TEST_CHECK("20060825000010AAN", allequal_ullong2( res_v, r10_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x11_v, y11_v);
   TEST_CHECK("20060825000011AAN", allequal_ullong2( res_v, r11_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x12_v, y12_v);
   TEST_CHECK("20060825000012AAN", allequal_ullong2( res_v, r12_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x13_v, y13_v);
   TEST_CHECK("20060825000013AAN", allequal_ullong2( res_v, r13_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x14_v, y14_v);
   TEST_CHECK("20060825000014AAN", allequal_ullong2( res_v, r14_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x15_v, y15_v);
   TEST_CHECK("20060825000015AAN", allequal_ullong2( res_v, r15_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x16_v, y16_v);
   TEST_CHECK("20060825000016AAN", allequal_ullong2( res_v, r16_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x17_v, y17_v);
   TEST_CHECK("20060825000017AAN", allequal_ullong2( res_v, r17_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x18_v, y18_v);
   TEST_CHECK("20060825000018AAN", allequal_ullong2( res_v, r18_v ), 0);
   res_v = (vec_ullong2)isgreaterequald2(x19_v, y19_v);
   TEST_CHECK("20060825000019AAN", allequal_ullong2( res_v, r19_v ), 0);
   
   TEST_SET_DONE();
   
   TEST_EXIT();
}