Esempio n. 1
0
/**
 * Element-wise threshold test.
 *
 * Each lane of the result is 1.0 where (v - vth) is strictly greater than
 * zero and 0.0 everywhere else.
 *
 * @param v    values to test
 * @param vth  per-lane thresholds
 * @return     vector holding 1.0 or 0.0 in every lane
 */
inline vec_float4 updateF(vec_float4 v,vec_float4 vth)
{
  vec_float4 zeros = (vec_float4){0.0f,0.0f,0.0f,0.0f};
  vec_float4 ones  = (vec_float4){1.0f,1.0f,1.0f,1.0f};

  /* A single select between two fully defined constants: the comparison
   * mask is computed once and no uninitialized value is ever read. */
  return spu_sel(zeros,ones,spu_cmpgt(spu_sub(v,vth),zeros));
}
Esempio n. 2
0
  // Runs the FFT kernel over one block of data.
  //
  // The transform itself is always invoked with CML_FFT_FWD. What happens to
  // the output afterwards depends on the 'dir' member:
  //   - dir == -1: the output is passed to cml_core_rcsvmul1_f together with
  //     'scale' (skipped when scale is exactly 1).
  //   - otherwise: the output is reordered in place and multiplied by 'scale'
  //     by the loop below (the inverse-FFT recipe referenced in the comment).
  //
  // in     input data; reinterpreted as float* for the kernel call
  // out    output data; reinterpreted as float* and rewritten in place
  // p_in   not used by this method
  // p_out  not used by this method
  void compute(
    in0_type  const& in,
    out0_type const& out,
    Pinfo const&     p_in,
    Pinfo const&     p_out)
  {
#if DEBUG
    // NOTE(review): 'size' is a size_t printed with %d, and in/out are printed
    // with %05x -- confirm these format specifiers match the actual types.
    printf("uk_ccfft_f(%d): compute -- start %05x %05x\n", size, in, out);
#endif
    // Handle inverse FFT explicitly so that shuffle and scale can happen
    // in single step.
    cml_core_ccfft1d_op_mi_f(fft, (float*)in, (float*)out, CML_FFT_FWD);

    if (dir == -1)
    {
      // Only scaling is applied on this path; nothing to do for a unit scale.
      if (scale != 1.f)
	cml_core_rcsvmul1_f(scale, (float*)out, (float*)out, size);
    }
    else
    {
      // Code for the inverse FFT taken from the CBE SDK Libraries
      // Overview and Users Guide, sec. 8.1.
      int const vec_size = 4;
      vector float* start = (vector float*)out;
      vector float* end   = start + 2 * size / vec_size;
      vector float  s0, s1, e0, e1;
      // The mask makes spu_sel take the first two words of each vector from
      // its second operand and the last two words from its first operand.
      vector unsigned int mask = (vector unsigned int){-1, -1, 0, 0};
      vector float vscale = spu_splats(scale);
      unsigned int i;
      
      // Scale the output vector and swap the order of the outputs.
      // Note: there are two float values for each of 'n' complex values.
      s0 = e1 = *start;
      for (i = 0; i < size / vec_size; ++i) 
      {
	// Load the next vector from the front and from the back before either
	// location is overwritten below.
	s1 = *(start + 1);
	e0 = *(--end);
	
	// Front slot receives data read from the back, back slot receives data
	// read from the front; both are scaled on the way.
	*start++ = spu_mul(spu_sel(e0, e1, mask), vscale);
	*end     = spu_mul(spu_sel(s0, s1, mask), vscale);
	// Carry the just-loaded vectors over to the next iteration.
	s0 = s1;
	e1 = e0;
      }
    }
  }

  // Member data
  size_t      size;
  int         dir;
  float       scale;

  fft1d_f*    fft;

  static char buf1[FFT_BUF1_SIZE_BYTES];
  static char buf2[FFT_BUF2_SIZE_BYTES];
};
Esempio n. 3
0
/**
 * Finds the point of a point set that lies furthest along a direction.
 *
 * The direction is first multiplied component-wise by localScaling; the
 * point whose dot product with that scaled direction is largest is selected
 * and returned multiplied by localScaling.
 *
 * @param localDirOrg   search direction
 * @param points        array of candidate points
 * @param numPoints     number of entries in points
 * @param localScaling  per-axis scaling applied to the direction and to the
 *                      returned point
 * @return              the selected point, scaled by localScaling
 */
static btVector3 convexHullSupport (const btVector3& localDirOrg, const btVector3* points, int numPoints, const btVector3& localScaling)
{	

	btVector3 vec = localDirOrg * localScaling;

#if defined (__CELLOS_LV2__) && defined (__SPU__)

	btVector3 localDir = vec;

	// Running maximum and its index are kept in element 0 of these vectors.
	vec_float4 v_distMax = {-FLT_MAX,0,0,0};
	vec_int4 v_idxMax = {-999,0,0,0};
	int v=0;
	int numverts = numPoints;

	// Main loop: four points per iteration, reduced pairwise to one
	// candidate that is then compared with the running maximum.
	for(;v<(int)numverts-4;v+=4) {
		vec_float4 p0 = vec_dot3(points[v  ].get128(),localDir.get128());
		vec_float4 p1 = vec_dot3(points[v+1].get128(),localDir.get128());
		vec_float4 p2 = vec_dot3(points[v+2].get128(),localDir.get128());
		vec_float4 p3 = vec_dot3(points[v+3].get128(),localDir.get128());
		const vec_int4 i0 = {v  ,0,0,0};
		const vec_int4 i1 = {v+1,0,0,0};
		const vec_int4 i2 = {v+2,0,0,0};
		const vec_int4 i3 = {v+3,0,0,0};
		vec_uint4  retGt01 = spu_cmpgt(p0,p1);
		vec_float4 pmax01 = spu_sel(p1,p0,retGt01);
		vec_int4   imax01 = spu_sel(i1,i0,retGt01);
		vec_uint4  retGt23 = spu_cmpgt(p2,p3);
		vec_float4 pmax23 = spu_sel(p3,p2,retGt23);
		vec_int4   imax23 = spu_sel(i3,i2,retGt23);
		vec_uint4  retGt0123 = spu_cmpgt(pmax01,pmax23);
		vec_float4 pmax0123 = spu_sel(pmax23,pmax01,retGt0123);
		vec_int4   imax0123 = spu_sel(imax23,imax01,retGt0123);
		// The running maximum is kept only when it is strictly greater.
		vec_uint4  retGtMax = spu_cmpgt(v_distMax,pmax0123);
		v_distMax = spu_sel(pmax0123,v_distMax,retGtMax);
		v_idxMax = spu_sel(imax0123,v_idxMax,retGtMax);
	}
	// Tail loop: remaining points, one at a time.
	for(;v<(int)numverts;v++) {
		vec_float4 p = vec_dot3(points[v].get128(),localDir.get128());
		const vec_int4 i = {v,0,0,0};
		vec_uint4  retGtMax = spu_cmpgt(v_distMax,p);
		v_distMax = spu_sel(p,v_distMax,retGtMax);
		v_idxMax = spu_sel(i,v_idxMax,retGtMax);
	}
	int ptIndex = spu_extract(v_idxMax,0);
	// The index keeps its -999 seed when no point replaced the initial
	// maximum (for example when numPoints is 0). Check it before indexing,
	// as the non-SPU path below does.
	btAssert(ptIndex >= 0);
	const btVector3& supVec= points[ptIndex] * localScaling;
	return supVec;
#else

    btScalar maxDot;
    long ptIndex = vec.maxDot( points, numPoints, maxDot);
	btAssert(ptIndex >= 0);
	btVector3 supVec = points[ptIndex] * localScaling;
	return supVec;
#endif //__SPU__
}
Esempio n. 4
0
/*
 * Polls (without blocking) the DMA transfer tracked by
 * md[am].held_tag[side]. When the tag reports completion, the elements that
 * were waiting are accounted for: the HEAD index advances (wrapping at the
 * buffer size), num_pulled grows, num_waiting is reset, mm_depleted is set
 * once num_pulled equals mcb[am].data_size[side], and the tag is released.
 *
 * side: index of the input whose transfer is checked.
 */
void check_pull_dma(int side){
  // A held_tag of 32 or more means no tag is currently held for this side
  // (32 is the value stored after a release below).
  if(md[am].held_tag[side] < 32){
    // Unsigned literal: with tag 31 a signed "1 << 31" would overflow int.
    mfc_write_tag_mask( 1u << md[am].held_tag[side] );
    int status = mfc_read_tag_status_immediate();

    if(status){
      // Advance HEAD by the number of elements that just arrived.
      md[am].idx[side][HEAD] = spu_add(md[am].idx[side][HEAD], md[am].num_waiting[side]);

      // Wrap around: where HEAD > buffer_size - 1, subtract buffer_size.
      vector signed int buffer_size = spu_splats(mcb[am].buffer_size[side] -1);
      vector unsigned int cmp_v = spu_cmpgt(md[am].idx[side][HEAD], buffer_size);
      vector signed int zeros = {0,0,0,0};
      buffer_size = spu_add(buffer_size,1);
      zeros = spu_sel(zeros,buffer_size,cmp_v);
      md[am].idx[side][HEAD] = spu_sub(md[am].idx[side][HEAD],zeros);

      md[am].num_pulled[side] += md[am].num_waiting[side];      
      md[am].num_waiting[side] = 0;
      if(md[am].num_pulled[side] == mcb[am].data_size[side]){
	md[am].mm_depleted[side] = 1;
      }
      // Release tag
      mfc_tag_release( md[am].held_tag[side] );
      md[am].held_tag[side] = 32;      
    }
  }
}
Esempio n. 5
0
/*
 * Per-lane absolute difference between the raw 32-bit patterns of two float
 * vectors, both reinterpreted as signed integers.
 *
 * ref:  reference values
 * vals: values compared against the reference
 * Returns |bits(ref) - bits(vals)| for every lane, as unsigned integers.
 */
vec_uint4 ulpDiff_f4(vec_float4 ref, vec_float4 vals) {
  vec_int4 delta = spu_sub((vec_int4)ref, (vec_int4)vals);
  vec_uint4 is_positive = spu_cmpgt(delta, 0);
  vec_int4 negated = spu_sub(spu_splats((int)0), delta);

  /* Keep delta where it is already positive, its negation elsewhere. */
  return (vec_uint4)(spu_sel(negated, delta, is_positive));
}
/* Searches __mfc_tag_table for a run of number_of_tags consecutive set bits,
   clears that run in the table, and returns the index of its first bit
   (counted from the most significant bit).  Per the variable description
   below, leading zero bits are counted as busy tags and set bits as free
   tags.  When no run is found the table is left unchanged and 0xFFFFFFFF is
   returned.  */
unsigned int
__mfc_multi_tag_reserve (unsigned int number_of_tags)
{
  vector unsigned int table_copy;
  vector unsigned int one = (vector unsigned int)
        { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
  vector unsigned int count_busy, is_valid;
  vector unsigned int count_total;
  vector unsigned int count_avail = (vector unsigned int) { 0, 0, 0, 0 };
  vector unsigned int index = (vector unsigned int) { 0, 0, 0, 0 };

  table_copy = __mfc_tag_table;


  /* count_busy: number of consecutive busy tags
     count_avail: number of consecutive free tags
     table_copy: temporary copy of the tag table
     count_total: sum of count_busy and count_avail
     index: index of the current working tag  */
  do
    {
      /* Drop the free run examined by the previous iteration.  */
      table_copy = spu_sl (table_copy, count_avail);

      /* Skip the leading zeros, then measure the run of ones that follows
	 by counting leading zeros of the complement.  */
      count_busy = spu_cntlz (table_copy);
      table_copy = spu_sl (table_copy, count_busy);
      count_avail = spu_cntlz (spu_xor(table_copy, -1));
      count_total = spu_add (count_busy, count_avail);
      index = spu_add (index, count_total);
    }
  /* Stop when the run is long enough or nothing is left to scan.  */
  while (spu_extract (count_avail, 0) < number_of_tags
	 && spu_extract (table_copy, 0) != 0);

  /* index now points past the run; step back to its first bit.  */
  index = spu_sub (index, count_avail);

  /* is_valid is set to 0xFFFFFFFF if table_copy == 0, 0 otherwise.
     Despite its name, all ones here means the search ran out of table:
     the select below then turns index into 0xFFFFFFFF.  */
  is_valid = spu_cmpeq (table_copy, 0);
  index = spu_sel (index, is_valid, is_valid);

  /* Now I need to actually mark the tags as used.
     Build a word whose only zero bits are the number_of_tags bits starting
     at index, use it to clear those bits in the table, and keep the old
     table when the search failed.  */
  table_copy = spu_sl (one, number_of_tags);
  table_copy = spu_rl (table_copy, -number_of_tags - spu_extract (index, 0));
  table_copy = spu_sel (table_copy, __mfc_tag_table, table_copy);
  __mfc_tag_table = spu_sel (table_copy, __mfc_tag_table, is_valid);

  return spu_extract (index, 0);
}
Esempio n. 7
0
int main(int argc, char **argv) {
   int i;
   vector unsigned int all_ones = (vector unsigned int) 
      {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};

   vector unsigned int all_zeroes = (vector unsigned int) 
      {0x00000000, 0x00000000, 0x00000000, 0x00000000};
   
   /* These bits will form the selection mask */
   unsigned short mask = 0x9;
   
   /* Each bit in 0x9 forms a word in the mask */
   vector unsigned int resultw = 
      spu_sel(all_zeroes, all_ones, spu_maskw(mask));
   printf("resultw: ");
   for (i=0; i<4; i++) {
      printf("%08x", spu_extract(resultw, i));
   }
   
   /* Each bit in 0x09 forms a halfword in the mask */
   vector unsigned short resulth = 
      spu_sel((vector unsigned short)all_zeroes, 
              (vector unsigned short)all_ones, 
              spu_maskh(mask));
   printf("\nresulth: ");
   for (i=0; i<8; i++) {
      printf("%04x", spu_extract(resulth, i));
   }

   /* Each bit in 0x0009 forms a byte in the mask */
   vector unsigned char resultb = 
      spu_sel((vector unsigned char)all_zeroes, 
              (vector unsigned char)all_ones, 
              spu_maskb(mask));
   printf("\nresultb: ");
   for (i=0; i<16; i++) {
      printf("%02x", spu_extract(resultb, i));
   }
   printf("\n");
   return 0;
}
Esempio n. 8
0
void cp_buffer(int side){
  int avail_out = num_free_in_buffer(OUT);
  int avail_side = num_in_buffer(side);
  int max = avail_out < avail_side ? avail_out : avail_side;

  vector signed int *out_head;
  if(mcb[am].local[OUT] < 255)
    out_head = (vector signed int*) &md[ mcb[am].local[OUT] ].idx[ (mcb[am].id+1)&1 ][HEAD];
  else
    out_head = (vector signed int*) &md[am].idx[OUT][HEAD];

  vector unsigned int cmp_v;
  vector signed int from_size = spu_splats( mcb[am].buffer_size[side] );
  vector signed int out_size = spu_splats( mcb[ mcb[am].local[OUT] ].buffer_size[ (mcb[am].id+1)&1 ] );
  vector signed int ones = {1,1,1,1};
  vector signed int zeros = {0,0,0,0};

  int i;
  for(i = 0; i < max; i++){
    md[am].buffer[OUT][spu_extract( *out_head,0)] = md[am].buffer[side][spu_extract(md[am].idx[side][TAIL],0)];
    // update idx
    md[am].idx[side][TAIL] = spu_add(md[am].idx[side][TAIL], ones);
    cmp_v = spu_cmpeq(md[am].idx[side][TAIL],from_size);
    md[am].idx[side][TAIL] = spu_sel(md[am].idx[side][TAIL], zeros, cmp_v);

    *out_head = spu_add(*out_head,ones);
    cmp_v = spu_cmpeq(*out_head, out_size);
    *out_head = spu_sel(*out_head,zeros,cmp_v);
  }

  update_tail(side);

  md[am].consumed[side] += max;

  if(mcb[am].local[OUT] < 255 && md[am].consumed[side] == mcb[am].data_size[side]){
    md[am].depleted[side] = 1;
    md[am].done = 1;
    --num_active_mergers;
  }
}
Esempio n. 9
0
/* Sets the number_of_tags bits of __mfc_tag_table that start at bit
   first_tag (counted from the most significant bit).  Judging by the
   reserve routines, a set bit presumably marks a free tag -- confirm against
   the table's definition.  The table is left unchanged when the request is
   out of range or when the check below finds any of those bits already set.
   Returns 0 on success and 0xFFFFFFFF when the request was rejected.  */
unsigned int
__mfc_multi_tag_release (unsigned int first_tag, unsigned int number_of_tags)
{
  vector unsigned int table_copy, tmp, tmp1;
  vector unsigned int one = (vector unsigned int)
        { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
  vector unsigned int is_invalid;
  unsigned int last_tag;
  vector unsigned int has_been_reserved;

  /* One past the last tag of the range.  */
  last_tag = first_tag + number_of_tags;

  /* Build a mask with ones exactly on the bits first_tag .. last_tag - 1.  */
  table_copy = spu_sl (one, number_of_tags);
  table_copy = spu_rl (table_copy, -last_tag);
  table_copy = spu_xor (table_copy, -1);

  /* Make sure the tags are in range and valid.  */
  tmp = spu_cmpgt (spu_promote(last_tag, 0), 32);
  tmp1 = spu_cmpgt (spu_promote(number_of_tags, 0), 32);
  is_invalid =  spu_cmpgt (spu_promote(first_tag, 0), 31);

  /* All bits are set to 1 if invalid, 0 if valid.  */
  is_invalid = spu_or (tmp, is_invalid);
  is_invalid = spu_or (tmp1, is_invalid);

  /* check whether these tags have been reserved
     The table is shifted so the range starts at the top bit and compared
     with a word whose top number_of_tags bits are clear: the comparison is
     true when at least one bit of the range is already set in the table.
     Note that, despite its name, has_been_reserved therefore flags the
     failure case.  */
  tmp = spu_rlmask (one, (int)-number_of_tags);
  tmp1 = spu_sl (__mfc_tag_table, first_tag);
  has_been_reserved = spu_cmpgt(tmp1, tmp);

  is_invalid = spu_or (has_been_reserved, is_invalid);

  /* Set the range bits in a copy, and commit it only for a valid request.  */
  table_copy = spu_sel (__mfc_tag_table, table_copy, table_copy);
  __mfc_tag_table = spu_sel (table_copy, __mfc_tag_table, is_invalid);

  return spu_extract (is_invalid, 0);
}
Esempio n. 10
0
unsigned int
__mfc_tag_reserve (void)
{
  vector unsigned int mask = (vector unsigned int)
	{ 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
  vector unsigned int count_zeros, is_valid;
  vector signed int count_neg;

  count_zeros = spu_cntlz (__mfc_tag_table);
  count_neg = spu_sub (0, (vector signed int) count_zeros);

  mask = spu_rlmask (mask, (vector signed int) count_neg);
  __mfc_tag_table = spu_andc (__mfc_tag_table, mask);

  is_valid = spu_cmpeq (count_zeros, 32);
  count_zeros = spu_sel (count_zeros, is_valid, is_valid);

  return spu_extract (count_zeros, 0);
}
Esempio n. 11
0
/* Scans the string pointed to by s for the character c and
 * returns a pointer to the last occurrence of c. If
 * c is not found, then NULL is returned.
 *
 * The string is examined one 16-byte quadword at a time. For every
 * quadword, spu_gather condenses the byte-wise compare results into a
 * 16-bit field; the most recent quadword containing a match is remembered
 * in res_ptr/res_cmp until the terminating zero byte is found.
 */
char * strrchr(const char *s, int c)
{
  int nskip;
  vec_uchar16 *ptr, data, vc;
  vec_uint4 cmp_c, cmp_0, cmp;
  vec_uint4 res_ptr, res_cmp;
  vec_uint4 mask, result;
  vec_uint4 one = spu_splats(0xffffU);
  /* Scan memory array a quadword at a time. Skip leading
   * mis-aligned bytes.
   */
  ptr = (vec_uchar16 *)s;

  /* Shifting the 16 one bits right by the misalignment clears the compare
   * bits of the bytes that precede s within the first quadword.
   */
  nskip = -((unsigned int)(ptr) & 15);
  mask = spu_rlmask(one, nskip);

  vc = spu_splats((unsigned char)(c));

  /* After the first load, ptr is rounded down so that it addresses the next
   * aligned quadword.
   */
  data = *ptr++;
  ptr = (vec_uchar16 *)((unsigned int)ptr & ~15);

  cmp_c = spu_and(spu_gather(spu_cmpeq(data, vc)), mask);
  cmp_0 = spu_and(spu_gather(spu_cmpeq(data, 0)), mask);

  res_ptr = spu_splats(0U);
  res_cmp = spu_splats(0U);

  /* Loop while the current quadword holds no terminator. */
  while (spu_extract(cmp_0, 0) == 0) {
    cmp = spu_cmpeq(cmp_c, 0);

    /* When the current quadword matched, remember its compare bits and the
     * current value of ptr (which already addresses the following quadword).
     */
    res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp);
    res_cmp = spu_sel(cmp_c, res_cmp, cmp);

    data = *ptr++;

    cmp_c = spu_gather(spu_cmpeq(data, vc));
    cmp_0 = spu_gather(spu_cmpeq(data, 0));

    /* NOTE(review): this value is recomputed before its next use, both at
     * the top of the loop and after it.
     */
    cmp = spu_cmpeq(cmp_c, 0);
  }

  /* Compute the location of the last character before termination
   * character.
   *
   * First mask off compare results following the first termination character.
   */
  mask = spu_sl(one, 31 - spu_extract(spu_cntlz(cmp_0), 0));
  cmp_c = spu_and(cmp_c, mask);

  /* Conditionally update res_ptr and res_cmp if a match was found in the last
   * quadword.
   */
  cmp = spu_cmpeq(cmp_c, 0);

  res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp);
  res_cmp = spu_sel(cmp_c, res_cmp, cmp);

  /* Bit reverse res_cmp for locating last occurrence: the compare bits are
   * expanded to bytes, the bytes are reversed with a shuffle, and the result
   * is gathered back into bits. mask records the "no match at all" case.
   */
  mask = spu_cmpeq(res_cmp, 0);

  res_cmp = (vec_uint4)spu_maskb(spu_extract(res_cmp, 0));
  res_cmp = spu_gather((vec_uchar16)spu_shuffle(res_cmp, res_cmp,
						VEC_LITERAL(vec_uchar16,
							    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)));

  /* Compute the location (ptr) of the last occurrence of c. If no
   * occurrence was found (ie, element 0 of res_cmp == 0, then return
   * NULL.
   */
  result = spu_sub(spu_add(res_ptr, 15), spu_cntlz(res_cmp));
  result = spu_andc(result, mask);

  return ((char *)spu_extract(result, 0));
}
Esempio n. 12
0
/* Bitwise majority of x, y and z: wherever x and z hold the same bit that
 * bit is the majority, wherever they differ the bit of y decides.
 */
static inline
vec_uint4 vec_Maj(vec_uint4 x, vec_uint4 y, vec_uint4 z)
{
  vec_uint4 x_z_disagree = spu_xor(x, z);

  return spu_sel(x, y, x_z_disagree);
}
Esempio n. 13
0
/* Bitwise choose: every result bit is taken from y where the matching bit
 * of x is set and from z where it is clear.
 */
static inline
vec_uint4 vec_Ch(vec_uint4 x, vec_uint4 y, vec_uint4 z)
{
  vec_uint4 chooser = x;

  return spu_sel(z, y, chooser);
}
Esempio n. 14
0
/* Software division of two double precision vectors: returns, for each of
 * the two elements, a_in divided by b_in.
 *
 * Outline: the operands are classified (zero, denorm, infinity, NaN),
 * denormal inputs are rescaled, both mantissas are forced into [1.0,2.0),
 * a reciprocal of the divisor mantissa is refined from a single precision
 * estimate, and the quotient of the mantissas is finally multiplied by a
 * factor that carries the exponent and the special-value handling.
 */
vector double
__divv2df3 (vector double a_in, vector double b_in)
{
    /* Variables */
    vec_int4    exp, exp_bias;
    vec_uint4   no_underflow, overflow;
    vec_float4  mant_bf, inv_bf;
    vec_ullong2 exp_a, exp_b;
    vec_ullong2 a_nan, a_zero, a_inf, a_denorm, a_denorm0;
    vec_ullong2 b_nan, b_zero, b_inf, b_denorm, b_denorm0;
    vec_ullong2 nan;
    vec_uint4   a_exp, b_exp;
    vec_ullong2 a_mant_0, b_mant_0;
    vec_ullong2 a_exp_1s, b_exp_1s;
    vec_ullong2 sign_exp_mask;

    vec_double2 a, b;
    vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult;

    /* Constants */
    vec_uint4   exp_mask_u32 = spu_splats((unsigned int)0x7FF00000);
    /* Shuffle pattern: copy the high word of each doubleword over its low word */
    vec_uchar16 splat_hi = (vec_uchar16) {
        0,1,2,3, 0,1,2,3,  8, 9,10,11, 8,9,10,11
    };
    /* Shuffle pattern: swap the two words of each doubleword */
    vec_uchar16 swap_32 = (vec_uchar16) {
        4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
    };
    vec_ullong2 exp_mask = spu_splats(0x7FF0000000000000ULL);
    vec_ullong2 sign_mask = spu_splats(0x8000000000000000ULL);
    vec_float4  onef = spu_splats(1.0f);
    vec_double2 one = spu_splats(1.0);
    vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);

    sign_exp_mask = spu_or(sign_mask, exp_mask);

    /* Extract the floating point components from each of the operands including
     * exponent and mantissa.
     */
    a_exp = (vec_uint4)spu_and((vec_uint4)a_in, exp_mask_u32);
    a_exp = spu_shuffle(a_exp, a_exp, splat_hi);
    b_exp = (vec_uint4)spu_and((vec_uint4)b_in, exp_mask_u32);
    b_exp = spu_shuffle(b_exp, b_exp, splat_hi);

    /* The mantissa is zero only when both 32-bit halves compare equal to
     * zero, hence the AND of the compare result with its word-swapped copy.
     */
    a_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_mask), 0);
    a_mant_0 = spu_and(a_mant_0, spu_shuffle(a_mant_0, a_mant_0, swap_32));

    b_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_mask), 0);
    b_mant_0 = spu_and(b_mant_0, spu_shuffle(b_mant_0, b_mant_0, swap_32));

    a_exp_1s = (vec_ullong2)spu_cmpeq(a_exp, exp_mask_u32);
    b_exp_1s = (vec_ullong2)spu_cmpeq(b_exp, exp_mask_u32);

    /* Identify all possible special values that must be accommodated including:
     * +-denorm, +-0, +-infinity, and NaNs.
     */
    a_denorm0= (vec_ullong2)spu_cmpeq(a_exp, 0);
    a_nan    = spu_andc(a_exp_1s, a_mant_0);
    a_zero   = spu_and (a_denorm0, a_mant_0);
    a_inf    = spu_and (a_exp_1s, a_mant_0);
    a_denorm = spu_andc(a_denorm0, a_zero);

    b_denorm0= (vec_ullong2)spu_cmpeq(b_exp, 0);
    b_nan    = spu_andc(b_exp_1s, b_mant_0);
    b_zero   = spu_and (b_denorm0, b_mant_0);
    b_inf    = spu_and (b_exp_1s, b_mant_0);
    b_denorm = spu_andc(b_denorm0, b_zero);

    /* Scale denorm inputs into normalized numbers by conditionally scaling the
     * input parameters.
     */
    a = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_mask));
    a = spu_sel(a_in, a, a_denorm);

    b = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_mask));
    b = spu_sel(b_in, b, b_denorm);

    /* Extract the divisor and dividend exponent and force parameters into the signed
     * range [1.0,2.0) or (-2.0,-1.0].
     */
    exp_a = spu_and((vec_ullong2)a, exp_mask);
    exp_b = spu_and((vec_ullong2)b, exp_mask);

    mant_a = spu_sel(a, one, (vec_ullong2)exp_mask);
    mant_b = spu_sel(b, one, (vec_ullong2)exp_mask);

    /* Approximate the single reciprocal of b by using
     * the single precision reciprocal estimate followed by one
     * single precision iteration of Newton-Raphson.
     */
    mant_bf = spu_roundtf(mant_b);
    inv_bf = spu_re(mant_bf);
    inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf);

    /* Perform 2 more Newton-Raphson iterations in double precision. The
     * result (q1) is in the range (0.5, 2.0).
     */
    inv_b = spu_extend(inv_bf);
    inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b);
    q0 = spu_mul(mant_a, inv_b);
    q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0);

    /* Determine the exponent correction factor that must be applied
     * to q1 by taking into account the exponent of the normalized inputs
     * and the scale factors that were applied to normalize them.
     */
    exp = spu_rlmaska(spu_sub((vec_int4)exp_a, (vec_int4)exp_b), -20);
    exp = spu_add(exp, (vec_int4)spu_add(spu_and((vec_int4)a_denorm, -0x34), spu_and((vec_int4)b_denorm, 0x34)));

    /* Bias the quotient exponent depending on the sign of the exponent correction
     * factor so that a single multiplier will ensure the entire double precision
     * domain (including denorms) can be achieved.
     *
     *    exp          bias q1     adjust exp
     *   =====         ========    ==========
     *   positive         2^+65         -65
     *   negative         2^-64         +64
     */
    exp_bias = spu_xor(spu_rlmaska(exp, -31), 64);
    exp = spu_sub(exp, exp_bias);

    q1 = spu_sel(q1, (vec_double2)spu_add((vec_int4)q1, spu_sl(exp_bias, 20)), exp_mask);

    /* Compute a multiplier (mult) to be applied to the quotient (q1) to produce the
     * expected result. On overflow, clamp the multiplier to the maximum non-infinite
     * number in case the rounding mode is not round-to-nearest.
     */
    exp = spu_add(exp, 0x3FF);
    no_underflow = spu_cmpgt(exp, 0);
    overflow = spu_cmpgt(exp, 0x7FE);
    exp = spu_and(spu_sl(exp, 20), (vec_int4)no_underflow);
    exp = spu_and(exp, (vec_int4)exp_mask);

    mult = spu_sel((vec_double2)exp, (vec_double2)(spu_add((vec_uint4)exp_mask, -1)), (vec_ullong2)overflow);

    /* Handle special value conditions. These include:
     *
     * 1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN a NaN
     *    results.
     * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN a INFINITY results.
     * 3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0 results.
     */
    mult = spu_andc(mult, (vec_double2)spu_or(a_zero, b_inf));
    mult = spu_sel(mult, (vec_double2)exp_mask, spu_or(a_inf, b_zero));

    nan = spu_or(a_nan, b_nan);
    nan = spu_or(nan, spu_and(a_zero, b_zero));
    nan = spu_or(nan, spu_and(a_inf, b_inf));

    mult = spu_or(mult, (vec_double2)nan);

    /* Scale the final quotient */

    q2 = spu_mul(q1, mult);

    return (q2);
}
Esempio n. 15
0
/* Renders this SPU's share of a Mandelbrot image and transfers it to main
 * memory.
 *
 * Rows spu.rank, spu.rank + spu.count, ... below spu.height are computed
 * four pixels at a time; each finished row is sent with mfc_put to
 * buf_ea + row * spu.width * 4 while the next row is being computed.
 *
 * buf_ea: effective address of the destination image buffer.
 *
 * NOTE(review): the local line buffer holds two rows of 1920 pixels and
 * nothing here checks spu.width against that limit -- confirm the caller
 * guarantees it.
 */
void draw_frame(uint64_t buf_ea) {
    vec_uint4 buf[2*1920/4];
    int row, col, i, tag = 0;
    /* Distance between two neighbouring pixels in the complex plane */
    float step = 4.0f/spu.width*spu.zoom;
    float xbeg = spu.xc - spu.width*step*0.5f;
    /* x coordinates of the first four pixels of a row */
    vec_float4 vxbeg = spu_splats(xbeg)
    + spu_splats(step) * (vec_float4) {
        0.f,1.f,2.f,3.f
    };
    vec_float4 xstep = spu_splats(step)*spu_splats(4.f);
    /* y coordinate of the first row handled by this SPU, and the increment
     * to the next row it handles */
    vec_float4 vyp = spu_splats(spu.yc - spu.height*step*0.5f + step*spu.rank);
    const vec_float4 vinc = spu_splats(spu.count * step);
    const vec_float4 esc2 = spu_splats(BAILOUT*BAILOUT);
#if BAILBITS != 1
    const vec_float4 esc21 = spu_splats(4.f/(BAILOUT*BAILOUT));
#endif
    const vec_float4 two = spu_splats(2.f);
    const vec_float4 zero = spu_splats(0.f);
    const vec_float4 colsc = spu_splats(255.f);
    const vec_float4 ccr = spu_splats(4.f*BAILOUT/(3.5f*3.141592654f));
    const vec_float4 ccg = spu_splats(4.f*BAILOUT/(5.f*3.141592654f));
    const vec_float4 ccb = spu_splats(4.f*BAILOUT/(9.f*3.141592654f));
    vec_float4 x, y, x2, y2, m2, vxp;
    vec_uint4 cmp, inc;
    vec_uint4 vi;
    vec_uint4 *p, *b;
    vec_float4 co;

    /* Process the full image. As there are 6 SPUs working in parallel, each with
     * a different rank from 0 to 5, each SPU processes only the line numbers:
     * rank, rank+6, rank+12, ...
     * The program uses a SPU DMA programming technique known as "double buffering",
     * where the previously generated line is transmitted to main memory while we
     * compute the next one, hence the need for a local buffer containing two lines.
     */
    for (row = spu.rank; row < spu.height; row += spu.count) {
        /* Pixel buffer address (in local memory) of the next line to be drawn */
        /* (tag is 0 or 1, so the offset is either 0 or one full line) */
        b = p = buf + ((1920/4)&-tag);
        vxp = vxbeg; /* first four x coordinates */
        /* Process a whole screen line by packets of 4 pixels */
        for (col = spu.width/4; col > 0 ; col--) {
            vi = spu_splats(0u);
            x = vxp;
            y = vyp;
            i = 0;
            cmp = spu_splats(-1u);
            inc = spu_splats(1u);
            m2 = zero;

            /* This loop processes the Mandelbrot suite for the four complex numbers
             * whose real part are the components of the x vector, and the imaginary
             * part are in y (as we process the same line, all initial values of y
             * are equal).
             * We perform loop unrolling for SPU performance optimization reasons,
             * hence the 4x replication of the same computation block.
             */
            do {
                x2 = x*x;
                y2 = y*y;
                /* m2 is only refreshed for lanes that had not escaped yet */
                m2 = spu_sel(m2, x2+y2, cmp);
                cmp = spu_cmpgt(esc2, m2);
                inc = spu_and(inc, cmp); /* increment the iteration count only if */
                vi = vi + inc;           /* we're still inside the bailout radius */
                y = two*x*y + vyp;
                x = x2-y2 + vxp;

                x2 = x*x;
                y2 = y*y;
                m2 = spu_sel(m2, x2+y2, cmp);
                cmp = spu_cmpgt(esc2, m2);
                inc = spu_and(inc, cmp);
                vi = vi + inc;
                y = two*x*y + vyp;
                x = x2-y2 + vxp;

                x2 = x*x;
                y2 = y*y;
                m2 = spu_sel(m2, x2+y2, cmp);
                cmp = spu_cmpgt(esc2, m2);
                inc = spu_and(inc, cmp);
                vi = vi + inc;
                y = two*x*y + vyp;
                x = x2-y2 + vxp;

                x2 = x*x;
                y2 = y*y;
                m2 = spu_sel(m2, x2+y2, cmp);
                cmp = spu_cmpgt(esc2, m2);
                inc = spu_and(inc, cmp);
                vi = vi + inc;
                y = two*x*y + vyp;
                x = x2-y2 + vxp;

                i += 4;
            }
            /* Exit the loop only if the iteration limit of 128 has been reached,
             * or all current four points are outside the bailout radius.
             * The __builtin_expect(xxx, 1) construct hints the compiler that the xxx
             * test has greater chance of being true (1), so a branch hinting
             * instruction is inserted into the binary code to make the conditional
             * branch faster in most cases (except the last one when we exit the
             * loop). This results in performance increase.
             */
            while (__builtin_expect((i < 128) &
                                    (si_to_int((qword)spu_gather(cmp)) != 0), 1));
            /* smooth coloring: compute the fractional part */
            co = spu_convtf(vi, 0) + spu_splats(1.f);
            co -= fast_logf(fast_logf(m2) * spu_splats(.5f));
#if BAILBITS != 1
            co = spu_re(spu_rsqrte(co*esc21));
#endif
            /* Compute the red, green and blue pixel components */
            vec_uint4 cr = spu_convtu(mcos(co * ccr) * colsc, 0);
            vec_uint4 cg = spu_convtu(mcos(co * ccg) * colsc, 0);
            vec_uint4 cb = spu_convtu(mcos(co * ccb) * colsc, 0);
            /* Put the 4 pixel values in the buffer */
            /* (inc is still 1 for lanes that never escaped, so ~-inc zeroes
             * the pixel value of those lanes) */
            *p++ = (spu_sl(cr, 16) | spu_sl(cg, 8) | cb) & ~-inc;

            vxp += xstep;
        }

        /* double-buffered dma: initiate a dma transfer of last computed scanline
         * then wait for completion of the second last transfer (previous computed
         * line). This is done by changing the tag value.
         */
        mfc_put(b, buf_ea+(spu.width*4)*row, spu.width*4, tag, 0, 0);
        tag = 1 - tag;
        wait_for_completion(tag);
        vyp += vinc;
    }
    /* wait for completion of last sent image line */
    wait_for_completion(1-tag);
}
Esempio n. 16
0
/*
 * Evaluates every candidate split plane between bins on three axes at once
 * and stores the cheapest split in 'result'.
 *
 * Steps:
 *  1. minbins are accumulated front to back and maxbins back to front, each
 *     bin adding the vector stored next to it (this assumes the .b arrays of
 *     consecutive bins are adjacent in memory -- TODO confirm).
 *  2. For every plane position the cost returned by SAHCostSIMD is compared
 *     with the best cost so far, per axis; cost, bin index and plane
 *     position are kept unless the new cost is strictly greater.
 *  3. The axis (0..2) with the lowest best cost wins; on equal cost the
 *     lower axis is kept.
 *
 * mmb:    bins and the starting best cost; bestcost is updated on return
 * result: bounding box input (baabb); receives plane, axis, left_size and
 *         right_size
 */
void MinMaxBinFindBest3SIMD(minmaxbin_t *mmb, kdbuffer_t *result)
{
	int bin;

	/* Step 1: running sums over the bins */
	for(bin = 1; bin < mmb->numbins; bin++)
	{
		vector float *lo = (vector float *)mmb->minbins[bin].b;
		vector float *hi = (vector float *)mmb->maxbins[mmb->numbins - bin - 1].b;

		lo[0] = spu_add(lo[0], lo[-1]);
		hi[0] = spu_add(hi[0], hi[1]);
	}

	vector float *box_max = (vector float*)result->baabb.max;
	vector float *box_min = (vector float*)result->baabb.min;

	/* Box extent per axis and the distance between two candidate planes */
	vector float extent = spu_abs( spu_sub(*box_max, *box_min) );
	vector float bin_width = spu_mul(extent, spu_splats(1/(float)mmb->numbins));
	vector float plane = spu_add(*box_min, bin_width);

	vector float side_area = { extent[1] * extent[2], extent[0] * extent[2], extent[0] * extent[1], 0 };
	vector float inv_area = spu_splats( 1/(extent[0] * side_area[0]));
	vector float traverse_cost = spu_splats(2.0f);

	/* Best candidate found so far, tracked separately for every axis */
	vector float best_cost = spu_splats(mmb->bestcost);
	vector int best_bin = spu_splats(0);
	vector float best_plane = plane;

	/* Step 2: walk the candidate planes */
	for(bin = 0; bin < mmb->numbins-1; bin++)
	{
		vector float area_left, area_right;

		AreaLeftRight(*box_min, *box_max, side_area, plane, &area_left, &area_right);

		vector float *left_bin = (vector float *)mmb->minbins[bin].b;
		vector float *right_bin = (vector float *)mmb->maxbins[bin+1].b;

		vector float cost = SAHCostSIMD(inv_area, traverse_cost, *left_bin, area_left, *right_bin, area_right);

		/* Lanes where the new cost is strictly greater keep the old best */
		vector unsigned int keep_old = spu_cmpgt(cost, best_cost);
		best_cost = spu_sel(cost, best_cost, keep_old);
		best_bin = spu_sel(spu_splats(bin), best_bin, keep_old);
		best_plane = spu_sel(plane, best_plane, keep_old);

		plane = spu_add(plane, bin_width);
	}

	/* Step 3: pick the cheapest of the three axes */
	int axis = 0;
	int candidate;

	for(candidate = 1; candidate < 3; candidate++)
	{
		if(best_cost[candidate] < best_cost[axis])
			axis = candidate;
	}

	int index = best_bin[axis];

	result->plane = best_plane[axis];
	result->axis = axis;
	result->left_size = (int)mmb->minbins[ index ].b[axis];
	result->right_size = (int)mmb->maxbins[ index+1 ].b[axis];

	mmb->bestcost = best_cost[axis];
}
Esempio n. 17
0
void process_render_tasks(unsigned long eah_render_tasks, unsigned long eal_render_tasks)
{
    const vec_uchar16 SHUFFLE_MERGE_BYTES = (vec_uchar16) {	// merge lo bytes from unsigned shorts (array)
        1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
    };

    const vec_uchar16 SHUFFLE_GET_BUSY_WITH_ONES = (vec_uchar16) {	// get busy flag with ones in unused bytes
        0xc0, 0xc0, 2, 3, 0xc0,0xc0,0xc0,0xc0, 0xc0,0xc0,0xc0,0xc0
    };

    const vec_uchar16 ZERO_BYTES = (vec_uchar16) spu_splats(0);

    char trianglebuffer[ 256 + TRIANGLE_MAX_SIZE ];

    char	sync_buffer[128+127];
    void*	aligned_sync_buffer = (void*) ( ((unsigned long)sync_buffer+127) & ~127 );

    RenderableCacheLine* cache = (RenderableCacheLine*) aligned_sync_buffer;
    unsigned long long cache_ea;

    spu_mfcdma64(&cache_ea, eah_render_tasks, eal_render_tasks, sizeof(cache_ea), 0, MFC_GET_CMD);
    mfc_write_tag_mask(1<<0);
    mfc_read_tag_status_all();

    while (cache_ea) {
        // terminate immediately if possible
        if (spu_stat_in_mbox())
            return;

        // read the cache line
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
        spu_readch(MFC_RdAtomicStat);

        unsigned int endTriangle = cache->endTriangle;
        vec_ushort8 testTriangle = spu_splats((unsigned short) endTriangle);

        // first look for short chunks
        vec_uchar16 next = cache->chunkNext;
        vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK));

        // change next to word offset, note we don't care what the low bit shifted in is
        vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 );
        vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1));
        vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254));

        vec_ushort8 start0 = cache->chunkStart[0];
        vec_ushort8 start1 = cache->chunkStart[1];

        vec_ushort8 nstart0 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) );
        vec_ushort8 nstart1 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) );

        vec_ushort8 starteq0 = spu_cmpeq( nstart0, spu_splats((unsigned short)0) );
        vec_ushort8 starteq1 = spu_cmpeq( nstart1, spu_splats((unsigned short)0) );

        vec_ushort8 end0 = spu_sel( nstart0, spu_splats((unsigned short)4096), starteq0);
        vec_ushort8 end1 = spu_sel( nstart1, spu_splats((unsigned short)4096), starteq1);

        vec_ushort8 len0 = spu_sub( end0, start0);
        vec_ushort8 len1 = spu_sub( end1, start1);

        vec_ushort8 small0 = spu_cmpgt( spu_splats((unsigned short)17), len0);
        vec_ushort8 small1 = spu_cmpgt( spu_splats((unsigned short)17), len1);
        vec_uchar16 small = (vec_uchar16) spu_shuffle( small0, small1, MERGE );
        vec_uint4 smallChunkGather = spu_gather(small);

        // check to see if chunk is already at the last triangle
        vec_uint4 doneChunkGather = spu_gather( (vec_uchar16) spu_shuffle(
                (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[0]),
                (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[1]),
                SHUFFLE_MERGE_BYTES) );

        // check if the chunk is free
        vec_uint4 freeChunkGather = spu_gather(
                                        spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) );

        // check to see if the chunk is being processed
        vec_uint4 busyChunkGather = spu_gather(
                                        spu_cmpgt( cache->chunkNext, //spu_and(cache->chunkNext, CHUNKNEXT_MASK),
                                                spu_splats( (unsigned char) (CHUNKNEXT_BUSY_BIT-1) ) ) );

        // doneChunkGather, freeChunkGather, busyChunkGather - rightmost 16 bits of word 0
        // note that if freeChunkGather is true then busyChunkGather must also be true

        // done=false, free=false, busy=false -> can process
        // free=false, busy=false -> can be merged

        // decide which chunk to process
        vec_uint4 mayProcessGather = spu_nor( doneChunkGather, busyChunkGather );
        vec_uint4 mayProcessShortGather = spu_and( mayProcessGather, smallChunkGather );

        vec_uint4 shortSelMask = spu_cmpeq( mayProcessShortGather, spu_splats(0U) );
        vec_uint4 mayProcessSelection = spu_sel( mayProcessShortGather, mayProcessGather, shortSelMask );

        /*
        		if (!spu_extract(shortSelMask, 0))
        			printf("taken short: may=%04x short=%04x mayshort=%04x mask=%04x sel=%04x\n",
        				spu_extract(mayProcessGather, 0) & 0xffff,
        				spu_extract(smallChunkGather, 0),
        				spu_extract(mayProcessShortGather, 0),
        				spu_extract(shortSelMask, 0) & 0xffff,
        				spu_extract(mayProcessSelection, 0) & 0xffff );
        */

        vec_uint4 mayProcessBits = spu_sl( mayProcessSelection, 16);
        unsigned int chunkToProcess = spu_extract( spu_cntlz( mayProcessBits ), 0);
        unsigned int freeChunk = spu_extract( spu_cntlz( spu_sl( freeChunkGather, 16 ) ), 0);

        // if there's nothing to process, try the next cache line in the rendering tasks list
        if (!spu_extract(mayProcessBits, 0)) {
trynextcacheline:
            cache_ea = cache->next;
            // sleep();
            continue;
        }

        unsigned int chunkStart    	= cache->chunkStartArray   [chunkToProcess];
        unsigned int chunkTriangle	= cache->chunkTriangleArray[chunkToProcess];
        unsigned int chunkNext		= cache->chunkNextArray	   [chunkToProcess] & CHUNKNEXT_MASK;
        unsigned int chunkEnd		= (cache->chunkStartArray  [chunkNext]-1) & (NUMBER_OF_TILES-1);
        unsigned int chunkLength	= 1 + chunkEnd-chunkStart;

        // only need an extra block if the block is especially long
        if (chunkLength <= NUMBER_OF_TILES_PER_CHUNK) {
            freeChunk = 32;
        }

        // mark this block as busy
        cache->chunkNextArray[chunkToProcess] |= CHUNKNEXT_BUSY_BIT;

        // if there's at least one free chunk, claim it
        if (freeChunk != 32) {
            cache->chunkNextArray[freeChunk] = CHUNKNEXT_RESERVED;
            cache->chunkTriangleArray[freeChunk] = chunkTriangle;
        }

        // write the cache line back
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
        if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
            continue;

#ifdef INFO
        printf("[%d] Claimed chunk %d (%d-%d len %d) at tri %x end %x with free chunk %d\n", _SPUID,
               chunkToProcess, chunkStart, chunkEnd, chunkLength, chunkTriangle, endTriangle,
               freeChunk!=32 ? freeChunk : -1 );
//		debug_render_tasks(cache);
#endif

        Triangle* triangle;
        int firstTile;
        do {
            // read the triangle data for the current triangle
            unsigned int extra = chunkTriangle & 127;
            unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
            triangle = (Triangle*) (trianglebuffer+extra);
            unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

            // ensure DMA slot available
            do {} while (!spu_readchcnt(MFC_Cmd));

            spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea),
                         length, 0, MFC_GET_CMD);
            mfc_write_tag_mask(1<<0);
            mfc_read_tag_status_all();

            // get the triangle deltas
            firstTile = findFirstTriangleTile(triangle, chunkStart, chunkEnd);

            if (firstTile>=0)
                break;

            // no match, try next triangle
            chunkTriangle = triangle->next_triangle;
        } while (chunkTriangle != endTriangle);

        // if we actually have something to process...
        if (firstTile>=0) {
            // the "normal" splitting will now become:
            // chunkStart .. (firstTile-1)	-> triangle->next_triangle
            // firstTile .. (firstTile+NUM-1) -> chunkTriangle (BUSY)
            // (firstTile+NUM) .. chunkEnd -> chunkTriangle (FREE)

            int tailChunk;
            int thisChunk;
            int nextBlockStart;
            int thisBlockStart;
            int realBlockStart;
            do {
retry:
                // read the cache line
                spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
                spu_readch(MFC_RdAtomicStat);

                // calculate start of next block
                nextBlockStart = firstTile + NUMBER_OF_TILES_PER_CHUNK;
                if (nextBlockStart > chunkEnd)
                    nextBlockStart = chunkEnd+1;

                // calculate start of block to mark as busy
                thisBlockStart = nextBlockStart - NUMBER_OF_TILES_PER_CHUNK;
                if (thisBlockStart < chunkStart)
                    thisBlockStart = chunkStart;
                realBlockStart = thisBlockStart;

#ifdef INFO
                printf("[%d] nextBlockStart=%d, realBlockStart=%d, thisBlockStart=%d, chunkStart=%d\n", _SPUID,
                       nextBlockStart, realBlockStart, thisBlockStart, chunkStart);
#endif


                // allocate some more free chunks
                vec_uint4 freeChunkGather2 = spu_sl(spu_gather(spu_cmpeq(
                                                        spu_splats((unsigned char)CHUNKNEXT_FREE_BLOCK), cache->chunkNext)), 16);
                unsigned int freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);

                if (freeChunk == 32) {
                    // if we didn't have one before, try again
                    freeChunk = freeChunk2;

                    // and try to get the second one
                    freeChunkGather2 = spu_andc( freeChunkGather2, spu_promote(0x80000000>>freeChunk2, 0) );
                    freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);
                } else {
                    // speculatively clear the free chunk just in case we don't need it
                    cache->chunkNextArray[freeChunk] = CHUNKNEXT_FREE_BLOCK;
                }

#ifdef INFO
                printf("[%d] Free chunks %d and %d, cN=%d, nBS=%d, cE=%d, tBS=%d, cS=%d\n",
                       _SPUID, freeChunk, freeChunk2, chunkNext, nextBlockStart, chunkEnd, thisBlockStart, chunkStart );
#endif

                // mark region after as available for processing if required
                if (nextBlockStart < chunkEnd) {
                    if (freeChunk==32) {
                        // if no free chunk, relinquish entire block and write back
                        cache->chunkNextArray[chunkToProcess] = chunkNext;
                        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
                        // if writeback failed, we *might* have a free block, retry
                        if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
                            goto retry;

                        // otherwise give up and try the next cache line
                        goto trynextcacheline;
                    }
                    cache->chunkStartArray[freeChunk] = nextBlockStart;
                    cache->chunkNextArray[freeChunk] = chunkNext;
                    cache->chunkTriangleArray[freeChunk] = chunkTriangle;
                    cache->chunkNextArray[chunkToProcess] = freeChunk | CHUNKNEXT_BUSY_BIT;
                    tailChunk = freeChunk;
#ifdef INFO
                    printf("[%d] Insert tail, tailChunk=%d, chunkNext=%d, chunkToProcess=%d\n", _SPUID, tailChunk, chunkNext, chunkToProcess);
                    debug_render_tasks(cache);
#endif
                } else {
                    // we're gonna use freeChunk2 for the "in front" block, as we've not
                    // used freeChunk, let's use it as it's more likely to have a free chunk
                    freeChunk2 = freeChunk;
                    tailChunk = chunkNext;
                }

                // mark region before as available if required and possible
                thisChunk = chunkToProcess;
                if (thisBlockStart > chunkStart) {
                    if (freeChunk2 != 32) {
                        // mark this region as busy
                        cache->chunkStartArray[freeChunk2]=thisBlockStart;
                        cache->chunkNextArray[freeChunk2]=tailChunk | CHUNKNEXT_BUSY_BIT;
                        cache->chunkTriangleArray[freeChunk2]=chunkTriangle;

                        // mark region before as available for processing
                        cache->chunkNextArray[chunkToProcess]=freeChunk2;
                        cache->chunkTriangleArray[chunkToProcess]=triangle->next_triangle;
                        thisChunk = freeChunk2;
#ifdef INFO
                        printf("[%d] Insert new head, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk);
                        debug_render_tasks(cache);
#endif
                    } else {
                        // need to keep whole block, update info and mark bust
                        cache->chunkTriangleArray[chunkToProcess]=chunkTriangle;
                        cache->chunkNextArray[chunkToProcess]=tailChunk | CHUNKNEXT_BUSY_BIT;
                        realBlockStart = chunkStart;
                        printf("[%d] Keep whole block, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk);
                        debug_render_tasks(cache);
#ifdef INFO
#endif
                        sleep();
                    }
                }

                // merge chunks
                merge_cache_blocks(cache);

                // write the cache line back
                spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
            } while (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS);

            // finally after the write succeeded, update the variables
            chunkNext = tailChunk;
            chunkToProcess = thisChunk;
            chunkStart = firstTile; //thisBlockStart;
            chunkLength = nextBlockStart - firstTile;
            chunkEnd = chunkStart + chunkLength - 1;
            freeChunk = 32;

            // now we can process the block up to endTriangle
            initTileBuffers(thisBlockStart, chunkEnd);

            int ok=0;
            while (chunkTriangle != endTriangle) {
#ifdef INFO
                printf("[%d] Processing chunk %d at %4d len %4d, triangle %04x first=%d tbs=%d\n",
                       _SPUID, chunkToProcess, chunkStart, chunkLength,
                       chunkTriangle, firstTile, thisBlockStart);
#endif
                // and actually process that triangle on these chunks
                processTriangleChunks(triangle, cache, thisBlockStart, chunkEnd, chunkTriangle, ok);
                ok=1;
#ifdef PAUSE
                sleep();
#endif
                // and advance to the next-triangle
                chunkTriangle = triangle->next_triangle;

                // this should only ever happen if we're running really low on cache line slots
                // basically, if we pick up a block with more than NUMBER_OF_TILES_PER_CHUNK and
                // there's no slot to store the pre-NUMBER_OF_TILES_PER_CHUNK tiles.
                // in this case, we process from thisBlockStart only (because we know that from
                // chunkStart to there has no result) and then we only process one triangle
                if (chunkStart != realBlockStart) {
                    /*
                    printf("[%d] chunkStart=%d != realBlockStart %d, chunkEnd=%d, "
                    	"firstTile=%d chunk=%d\n",
                    	_SPUID, chunkStart, realBlockStart, chunkEnd,
                    	firstTile, chunkToProcess);
                    debug_render_tasks(cache);
                    */

                    // abort the while loop
                    break;
                }

                // read the next triangle
                unsigned int extra = chunkTriangle & 127;
                unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
                triangle = (Triangle*) (trianglebuffer+extra);
                unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

                // ensure DMA slot available
                do {} while (!spu_readchcnt(MFC_Cmd));

                spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea),
                             mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD);
                mfc_write_tag_mask(1<<0);
                mfc_read_tag_status_all();
            } // until chunkTriangle == endTriangle

            // flush any output buffers
            flushTileBuffers(thisBlockStart, chunkEnd);

        } // firstTile>=0
Esempio n. 18
0
/*
 * Bitwise majority of x, y and z: a result bit is set when at least two of
 * the three corresponding input bits are set.
 */
static inline
vec_uint4 vec_Maj(vec_uint4 x, vec_uint4 y, vec_uint4 z)
{
  /* Used where x has a 0 bit: both y and z must be set. */
  vec_uint4 both_set = spu_and(y, z);
  /* Used where x has a 1 bit: one of y or z is enough. */
  vec_uint4 either_set = spu_or(y, z);

  /* x acts as the bit-select mask between the two candidates. */
  return spu_sel(both_set, either_set, x);
}
Esempio n. 19
0
/*
 * Vectorised merge step for merger `am` (presumably the currently active
 * merger -- confirm against the caller).
 *
 * While the LEFT and RIGHT input buffers both hold data and the output buffer
 * has room, each iteration combines one vector from each input, writes one
 * vector to the output, and advances the tail index of the input that was
 * used up plus the output head index. After the loop the per-side consumed
 * counts are accumulated, inputs whose consumed count equals data_size are
 * flagged as depleted, and the merger is marked done when both are depleted.
 *
 * Takes no arguments and returns nothing; everything is read from and written
 * to the file-level mcb[], md[], am and num_active_mergers.
 */
void merge_buffers(){
  vector unsigned int cmp_v, cmp_v2;

  // per-lane step constants for avail_v: lane 0 = LEFT, lane 1 = RIGHT, lane 2 = OUT
  const vector signed int one_at_0 = {1,0,0,0};
  const vector signed int one_at_1 = {0,1,0,0};
  const vector signed int one_at_2 = {0,0,1,0};
  const vector signed int ones = {1,1,1,1};
  const vector signed int zeros = {0,0,0,0};

  // every result byte takes byte 31 of the shuffle input, i.e. the last byte
  // of the second operand; used below to broadcast one compare result
  const vector unsigned char cmp_v_shuffle_mask = {31,31,31,31,
						   31,31,31,31,
						   31,31,31,31,
						   31,31,31,31};
  vector unsigned char rev_mask;
  // word-reversal of the first shuffle operand (bytes 0-15)
  const vector unsigned char rev_left = {12,13,14,15,
					 8,9,10,11,
					 4,5,6,7,
					 0,1,2,3};

  // word-reversal of the second shuffle operand (bytes 16-31)
  const vector unsigned char rev_right = {28,29,30,31,
					  24,25,26,27,
					  20,21,22,23,
					  16,17,18,19};
  // pick the head index that tracks where output is written: when
  // local[OUT] < 255 it is an input-side head index of merger local[OUT],
  // otherwise this merger's own OUT head index
  // NOTE(review): 255 looks like a "no local parent" sentinel -- confirm
  vector signed int *out_head_idx;
  if(mcb[am].local[OUT] < 255){
    int parent_idx = mcb[am].local[OUT];
    int side = (mcb[am].id+1)&1;
    out_head_idx = (vector signed int*) &md[parent_idx].idx[side][HEAD];
  } else {
    out_head_idx = (vector signed int*) &md[am].idx[OUT][HEAD];
  }

  vector signed int *left_tail_idx = (vector signed int*) &md[am].idx[LEFT][TAIL];
  vector signed int *right_tail_idx = (vector signed int*) &md[am].idx[RIGHT][TAIL];

  // lanes: {LEFT, RIGHT, OUT, unused}
  vector signed int size_v = {mcb[am].buffer_size[LEFT], mcb[am].buffer_size[RIGHT], mcb[am].buffer_size[OUT], 0};
  // lane 3 is fixed at 1 so the gathered mask can reach 0x0F
  vector signed int avail_v = {num_in_buffer(LEFT), num_in_buffer(RIGHT), num_free_in_buffer(OUT), 1};
  // snapshot of the input counts, used after the loop to compute what was consumed
  vector signed int avail_before = { spu_extract(avail_v, 0), spu_extract(avail_v, 1), 0, 0 };
  vector unsigned int avail = spu_gather( spu_cmpgt(avail_v, zeros) ); // avail = 0x0F if all avail_v > zeros

  // working pointers into the three buffers, positioned at the current indices
  vector signed int *left, *right, *out;
  left = (vector signed int*) &md[am].buffer[LEFT][ spu_extract(*left_tail_idx,0) ];
  right = (vector signed int*) &md[am].buffer[RIGHT][ spu_extract(*right_tail_idx,0) ];
  out = (vector signed int*) &md[am].buffer[OUT][ spu_extract(*out_head_idx,0) ];

  #ifdef TRACE_TIME
    dec_val2 = spu_read_decrementer();
  #endif

  // run while LEFT and RIGHT have data and OUT has free space
  while(spu_extract(avail,0) == 0x0F){
    // cmp left and right to determine who gets eaten
    cmp_v = spu_cmpgt(*left,*right);
    cmp_v = spu_shuffle(cmp_v, cmp_v, cmp_v_shuffle_mask);
    // cmp_v = {FFFF,FFFF,FFFF,FFFF} if left[3] > right[3]

    // out receives the vector whose last element is not the greater one;
    // left receives the other vector with its words reversed
    *out = spu_sel(*left,*right,cmp_v);
    rev_mask = spu_sel(rev_right,rev_left,(vector unsigned char)cmp_v);
    *left = spu_shuffle(*left,*right,rev_mask);
    // data to be sorted is now in out and left, left in descending order

    // presumably leaves the smaller four values in *out and the larger four
    // in *left -- sort_vectors is defined elsewhere, confirm
    sort_vectors(out,left);

    // update index of the used side
    if( spu_extract(cmp_v,0) ){
      // left[3] > right[3]
      // the right vector was the one used up; the remainder stays in the left slot
      *right_tail_idx = spu_add(*right_tail_idx,ones);
      avail_v = spu_sub(avail_v, one_at_1);
      right++;
      // modulus hack
      // wrap to the start of the buffer once the index reaches the size.
      // NOTE(review): lane 0 of cmp_v2 compares against size_v lane 0, which
      // is buffer_size[LEFT]; this is only right if RIGHT has the same size
      cmp_v2 = spu_cmpeq(*right_tail_idx, size_v);
      if( __builtin_expect( spu_extract(cmp_v2,0) ,0) ){
	*right_tail_idx = zeros;
	right = (vector signed int*) &md[am].buffer[RIGHT][0];
      }
    } else {
      // the left vector was the one used up: park the remainder in the right
      // slot, then advance left
      *right = *left;
      *left_tail_idx = spu_add(*left_tail_idx,ones);
      avail_v = spu_sub(avail_v, one_at_0);
      left++;
      // modulus hack
      cmp_v2 = spu_cmpeq(*left_tail_idx, size_v);      
      if( __builtin_expect( spu_extract(cmp_v2,0) ,0) ){	
	*left_tail_idx = zeros;
	left = (vector signed int*) &md[am].buffer[LEFT][0];
      }
    }

    // update out head idx
    *out_head_idx = spu_add(*out_head_idx,ones);
    avail_v = spu_sub(avail_v, one_at_2);
    out++;
    // modulus hack
    // NOTE(review): as above, lane 0 compares against buffer_size[LEFT], not
    // buffer_size[OUT] -- confirm all buffer sizes are equal
    cmp_v2 = spu_cmpeq(*out_head_idx, size_v);
    if( __builtin_expect(spu_extract(cmp_v2,0),0) ){
      out = (vector signed int*) &md[am].buffer[OUT][0];
      *out_head_idx = zeros;
    }

    // is there data still available?
    avail = spu_gather(spu_cmpgt(avail_v, zeros));
  }

  #ifdef TRACE_TIME
  // the difference is negated, presumably because the decrementer counts down
  merge_loop_ticks += -(spu_read_decrementer() - dec_val2);
  #endif

  // how much got consumed from each input?
  vector signed int consumed = spu_sub(avail_before, avail_v);
  int consumed_left = spu_extract(consumed, 0);
  int consumed_right = spu_extract(consumed, 1);

  // only call update_tail for a side that actually advanced
  if(consumed_left)
    update_tail(LEFT);

  if(consumed_right)
    update_tail(RIGHT);

  md[am].consumed[LEFT] += consumed_left;
  md[am].consumed[RIGHT] += consumed_right;
    
  // an input is depleted once its running consumed count equals its data_size
  if(md[am].consumed[LEFT] == mcb[am].data_size[LEFT])
    md[am].depleted[LEFT] = 1;
  
  if(md[am].consumed[RIGHT] == mcb[am].data_size[RIGHT])
    md[am].depleted[RIGHT] = 1;

  // when local[OUT] < 255 and both inputs are depleted, retire this merger
  if(mcb[am].local[OUT] < 255 && md[am].depleted[LEFT] && md[am].depleted[RIGHT]){
    md[am].done = 1;
    --num_active_mergers;
  }
}
Esempio n. 20
0
/* Lane-wise maximum: each result lane is a where a > b, otherwise b. */
inline vector float spu_max(vector float a, vector float b)
{
	/* lanes where a is strictly greater than b */
	vector unsigned int a_is_greater = spu_cmpgt( a, b );

	/* take a in those lanes and b in all the others */
	return spu_sel( b, a, a_is_greater );
}
Esempio n. 21
0
/*
 * Returns a local buffer into which the next triangle for `context` can be
 * written, or NULL when the context has no renderable cache line or when the
 * triangle ring buffer has no room for another triangle.
 *
 * A buffer already handed out for the same context is returned again without
 * re-checking. On success the _currentTriangle* globals are set to describe
 * the allocation.
 */
Triangle* getTriangleBuffer(Context* context)
{
	// reuse the buffer we already handed out for this context, if any
	int sameContext = (context == _currentTriangleContext);
	if (sameContext && _currentTriangle)
		return _currentTriangle;

	// from here on we start from scratch for this context
	_currentTriangle	= NULL;
	_currentTriangleContext	= context;

	unsigned long long lineEA = context->renderableCacheLine;
	if (lineEA == 0)
		return NULL;

	// 128-byte aligned scratch copy of the renderable cache line
	char lineStorage[128+127];
	RenderableCacheLine* line = (RenderableCacheLine*) ( ((unsigned int)lineStorage+127) & ~127 );

	// fetch the cache line with MFC_GETLLAR_CMD and read the atomic status channel
	spu_mfcdma64(line, mfc_ea2h(lineEA), mfc_ea2l(lineEA), 128, 0, MFC_GETLLAR_CMD);
	spu_readch(MFC_RdAtomicStat);

	// Room check, done for all 16 chunk read pointers at once (two vectors of 8).
	// For each read pointer:
	//   limit      = (read > write) ? read : end-of-buffer
	//   extendOk   = limit > write + TRIANGLE_MAX_SIZE
	//   wraps      = write + 2*TRIANGLE_MAX_SIZE > end-of-buffer
	//   readAtZero = read == 0		(wrapping would then land on the reader)
	//   blocked    = !extendOk || (wraps && readAtZero)

	vec_ushort8 v_write	= spu_splats( line->endTriangle );
	vec_ushort8 v_bufEnd	= spu_splats( (unsigned short)TRIANGLE_BUFFER_SIZE);
	vec_ushort8 v_zero	= spu_splats( (unsigned short) 0 );
	vec_ushort8 v_test	= spu_add(v_write,   TRIANGLE_MAX_SIZE);
	vec_ushort8 v_next	= spu_add(v_write, 2*TRIANGLE_MAX_SIZE);

	// first eight read pointers
	vec_ushort8 v_readLo		= line->chunkTriangle[0];
	vec_ushort8 v_limitLo		= spu_sel( v_bufEnd, v_readLo, spu_cmpgt( v_readLo, v_write ) );
	vec_ushort8 v_extendOkLo	= spu_cmpgt( v_limitLo, v_test );
	vec_ushort8 v_readAtZeroLo	= spu_cmpeq( v_readLo, v_zero );

	// second eight read pointers
	vec_ushort8 v_readHi		= line->chunkTriangle[1];
	vec_ushort8 v_limitHi		= spu_sel( v_bufEnd, v_readHi, spu_cmpgt( v_readHi, v_write ) );
	vec_ushort8 v_extendOkHi	= spu_cmpgt( v_limitHi, v_test );
	vec_ushort8 v_readAtZeroHi	= spu_cmpeq( v_readHi, v_zero );

	// same value in every lane, since both operands are splats
	vec_ushort8 v_wraps8		= spu_cmpgt( v_next, v_bufEnd );

	// keep the odd byte of each halfword result, giving one byte per chunk
	vec_uchar16 v_oddBytes		= (vec_uchar16) { 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 };
	vec_uchar16 v_extendOk		= (vec_uchar16) spu_shuffle( v_extendOkLo, v_extendOkHi, v_oddBytes );
	vec_uchar16 v_readAtZero	= (vec_uchar16) spu_shuffle( v_readAtZeroLo, v_readAtZeroHi, v_oddBytes );
	vec_uchar16 v_wraps		= (vec_uchar16) v_wraps8;

	// blocked = (readAtZero & wraps) | ~extendOk
	vec_uchar16 v_blocked		= spu_orc( spu_and( v_readAtZero, v_wraps ), v_extendOk );

	// chunks marked CHUNKNEXT_FREE_BLOCK are left out of the check
	vec_uint4 v_freeBits = spu_gather(
		spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), line->chunkNext ) );
	vec_uint4 v_blockedBits = spu_andc( spu_gather( v_blocked ), (vec_uint4) v_freeBits );

	// a single blocking chunk means there is no room
	if ( spu_extract(v_blockedBits, 0) != 0 ) {
		return NULL;
	}

	// the write position is split into a 128-byte aligned base and the
	// leftover bytes within that 128-byte block
	unsigned int writeOffset = line->endTriangle;
	unsigned int leftover = writeOffset & 127;
	unsigned long long bufferEA = lineEA + TRIANGLE_OFFSET_FROM_CACHE_LINE + (writeOffset & ~127);
	_currentTriangleBufferExtra = leftover;

	// when the write position is not 128-byte aligned, pull in the 128-byte
	// block it falls in so the bytes before it are present locally
	if (leftover != 0) {
		spu_mfcdma64(_currentTriangleBuffer, mfc_ea2h(bufferEA), mfc_ea2l(bufferEA), 128, 0, MFC_GET_CMD);

		// wait for tag 0 to complete
		mfc_write_tag_mask(1<<0);
		mfc_read_tag_status_all();
	}

	// record everything describing this allocation
	_currentTriangle = (Triangle*) (_currentTriangleBuffer+_currentTriangleBufferExtra);
	_currentTriangleOffset = writeOffset;
	_currentTriangleRewind = v_wraps8;
	_currentTriangleCacheEndTriangleEAL = mfc_ea2l(lineEA) + (((char*)&line->endTriangle) - ((char*)line));
	_currentTriangleCacheEndTriangleEAH = mfc_ea2h(lineEA);
	_currentTriangleBufferEA = bufferEA;

	return _currentTriangle;
}
Esempio n. 22
0
/* Lane-wise minimum: each result lane is b where a > b, otherwise a. */
inline vector float spu_min(vector float a, vector float b)
{
	/* lanes where a is strictly greater than b */
	vector unsigned int a_is_greater = spu_cmpgt( a, b );

	/* take b in those lanes and a in all the others */
	return spu_sel( a, b, a_is_greater );
}