예제 #1
0
파일: testutils.c 프로젝트: Mashewnutz/Slo
/*
 * Per-lane "number of differing bits" estimate between two double vectors.
 *
 * Each lane of ref and vals is converted to an integer with
 * make_ulonglong() (presumably the raw bit pattern of the double -- confirm
 * against its definition), the absolute difference of the two integers is
 * taken, and ceil(log2(difference)) is stored in the matching lane of the
 * result.
 *
 * ref:  reference values.
 * vals: values under test.
 * Returns a vec_ullong2 holding the result for lane 0 and lane 1.
 * A lane whose two inputs convert to the same integer yields 0.
 */
vec_ullong2 bitDiff_d2(vec_double2 ref, vec_double2 vals) {
   double ref0, ref1, vals0, vals1;
   long long refi0, refi1, valsi0, valsi1, diff0, diff1;
   unsigned long long nbits0, nbits1;
   vec_ullong2 bits;

   ref0 = spu_extract(ref,0);
   ref1 = spu_extract(ref,1);
   vals0 = spu_extract(vals,0);
   vals1 = spu_extract(vals,1);

   refi0 = make_ulonglong(ref0);
   refi1 = make_ulonglong(ref1);
   valsi0 = make_ulonglong(vals0);
   valsi1 = make_ulonglong(vals1);

   /* Absolute difference of the integer representations. */
   diff0 = refi0 - valsi0;
   diff1 = refi1 - valsi1;

   if ( diff0 < 0 )
   {
      diff0 = valsi0 - refi0;
   }

   if ( diff1 < 0 )
   {
      diff1 = valsi1 - refi1;
   }

   /* log2(0) is -infinity and converting that to an unsigned integer is
    * undefined behaviour, so identical lanes are reported explicitly as
    * differing in 0 bits instead of going through the math calls. */
   nbits0 = ( diff0 == 0 ) ? 0ULL : (unsigned long long)ceil(log2((double)diff0));
   nbits1 = ( diff1 == 0 ) ? 0ULL : (unsigned long long)ceil(log2((double)diff1));

   bits = spu_promote( nbits0, 0 );
   bits = spu_insert( nbits1, bits, 1 );

   return bits;
}
예제 #2
0
파일: testutils.c 프로젝트: Mashewnutz/Slo
/*
 * Per-lane distance between two double vectors, measured on the integers
 * produced by make_ulonglong() (presumably the raw bit patterns, i.e. a
 * distance in units in the last place -- confirm against its definition).
 *
 * ref:  reference values.
 * vals: values under test.
 * Returns a vec_ullong2 whose lanes hold |int(ref) - int(vals)| for
 * lane 0 and lane 1 respectively.
 */
vec_ullong2 ulpDiff_d2(vec_double2 ref, vec_double2 vals) {
   /* Pull the four scalars out of the vectors. */
   double ref_lane0  = spu_extract(ref,0);
   double ref_lane1  = spu_extract(ref,1);
   double vals_lane0 = spu_extract(vals,0);
   double vals_lane1 = spu_extract(vals,1);

   /* Integer views of the scalars, held as signed so the sign test works. */
   long long ref_bits0  = make_ulonglong(ref_lane0);
   long long ref_bits1  = make_ulonglong(ref_lane1);
   long long vals_bits0 = make_ulonglong(vals_lane0);
   long long vals_bits1 = make_ulonglong(vals_lane1);

   /* Absolute difference: recompute with swapped operands when negative. */
   long long gap0 = ref_bits0 - vals_bits0;
   long long gap1 = ref_bits1 - vals_bits1;

   gap0 = ( gap0 < 0 ) ? ( vals_bits0 - ref_bits0 ) : gap0;
   gap1 = ( gap1 < 0 ) ? ( vals_bits1 - ref_bits1 ) : gap1;

   /* Pack lane 0 first, then drop lane 1 into the same vector. */
   return spu_insert( (unsigned long long)gap1,
                      spu_promote( (unsigned long long)gap0, 0 ),
                      1 );
}
예제 #3
0
/*
 * Returns entry n of the prime table, going through a small local cache.
 *
 * When n lies strictly above primeCacheStart and below
 * primeCacheStart + primeCacheSize the value is read from primeCacheData.
 * Otherwise CACHE_PRIME_SIZE*16 bytes are fetched by DMA from the table at
 * setup.vPrimes, starting at the 4-entry-aligned position at or below n,
 * and primeCacheStart is moved to that position before the lookup.
 */
int cacheGetPrime(int n)
{
    const int cached = (n < primeCacheStart + primeCacheSize) && (n > primeCacheStart);

    if (!cached)
    {
        // Fetch: refill the cache starting at the aligned position for n.
        const int aligned = n - (n % 4);
        const uint32_t dma_tag = mfc_tag_reserve();
        const uint32_t dma_bytes = CACHE_PRIME_SIZE*16;
        const unsigned long long src_ea = setup.vPrimes + aligned * 4;

        mfc_get(&primeCacheData, src_ea, dma_bytes, dma_tag, 0, 0);

        // Block until the transfer issued on this tag has completed.
        mfc_write_tag_mask(1 << dma_tag);
        mfc_read_tag_status_all();
        mfc_tag_release(dma_tag);

        primeCacheStart = aligned;
    }

    // Four entries per vector: pick the vector, then the lane inside it.
    return spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4);
}
예제 #4
0
/*
 * Demo: fills four float vectors with uniform random numbers, transforms
 * them with mc_transform_po_array_f4, and prints both sets of 16 values.
 */
int main() {

   /* Input (uniform) and output (transformed) arrays, 4 floats per vector */
   vector float uniform_vec[4];
   vector float normal_vec[4];
   struct timeval now;
   int k;

   /* Seed the generator from the current time in seconds */
   gettimeofday(&now, NULL);
   mc_rand_mt_init(now.tv_sec);

   /* Draw the uniform samples, then transform the whole array */
   mc_rand_mt_0_to_1_array_f4(4, uniform_vec);
   mc_transform_po_array_f4(4, uniform_vec, normal_vec,
      &mc_rand_mt_0_to_1_f4);

   /* Print all 16 inputs: vector k/4, lane k%4 */
   printf("Uniform Distribution: \n");
   for (k = 0; k < 16; k++) {
      printf("%f ", spu_extract(uniform_vec[k / 4], k % 4));
   }

   /* Print all 16 outputs in the same order */
   printf("\n\nNormal Distribution: \n");
   for (k = 0; k < 16; k++) {
      printf("%f ", spu_extract(normal_vec[k / 4], k % 4));
   }
   printf("\n");

   return 0;
}
예제 #5
0
/* Calculates the length of the string s, not including the terminating
 * \0 character.
 *
 * The string is examined one 16-byte vector (vec_uchar16) at a time rather
 * than one byte at a time.
 *
 * s: pointer to the NUL-terminated string to measure.
 * Returns the number of bytes that precede the first \0 byte.
 */
size_t strlen(const char *s)
{
  size_t len;
  unsigned int cnt, cmp, skip, mask;
  vec_uchar16 *ptr, data;

  /* Compensate for initial mis-aligned string: skip is the offset of s
   * inside its 16-byte block and mask keeps only the low (16 - skip) bits
   * of a 16-bit flag word, dropping the flags of the bytes before s.
   * NOTE(review): this relies on the vector load below reading the
   * enclosing aligned 16-byte block even when s is misaligned -- confirm
   * for the target.
   */
  ptr = (vec_uchar16 *)s;
  skip = (unsigned int)(ptr) & 15;
  mask = 0xFFFF >> skip;

  /* Compare every byte of the first block with 0 and gather the per-byte
   * results into one flag word, then discard the flags in front of s.
   */
  data = *ptr++;
  cmp = spu_extract(spu_gather(spu_cmpeq(data, 0)), 0);
  cmp &= mask;

  /* The flags sit in the low 16 bits of a 32-bit word, so the count of
   * leading zeros is 16 + (index of the first zero byte), or 32 when no
   * zero byte was flagged. Subtracting skip + 16 turns that count into a
   * length relative to s.
   */
  cnt = spu_extract(spu_cntlz(spu_promote(cmp, 0)), 0);
  len = cnt - (skip + 16);

  /* No terminator in the block just examined: move to the next block.
   * Each pass removes the 16-bit offset from the running length and adds
   * the new leading-zero count.
   */
  while (cnt == 32) {
    data = *ptr++;
    len -= 16;
    cnt = spu_extract(spu_cntlz(spu_gather(spu_cmpeq(data, 0))), 0);
    len += cnt;
  }

  return (len);
}
예제 #6
0
파일: testutils.c 프로젝트: Mashewnutz/Slo
/*
 * Returns 1 when every lane of bitDiff_f4(x, y) is <= tolerance (compared
 * as signed int), 0 otherwise.
 */
int allequal_bits_float4( vec_float4 x, vec_float4 y, int tolerance )
{
   vec_uint4 lane_bits = bitDiff_f4( x, y );
   int lane;

   /* Check lanes 0..3 in order and stop at the first one over tolerance. */
   for ( lane = 0; lane < 4; lane++ )
   {
      if ( (int)spu_extract(lane_bits, lane) > tolerance )
      {
         return 0;
      }
   }

   return 1;
}
예제 #7
0
파일: testutils.c 프로젝트: Mashewnutz/Slo
/*
 * Returns 1 when every lane of ulpDiff_f4(x, y) is <= tolerance (compared
 * as signed int), 0 otherwise.
 */
int allequal_ulps_float4( vec_float4 x, vec_float4 y, int tolerance )
{
   vec_uint4 lane_ulps = ulpDiff_f4( x, y );
   int lane;

   /* Check lanes 0..3 in order and stop at the first one over tolerance. */
   for ( lane = 0; lane < 4; lane++ )
   {
      if ( (int)spu_extract(lane_ulps, lane) > tolerance )
      {
         return 0;
      }
   }

   return 1;
}
예제 #8
0
파일: cacheline.c 프로젝트: ralferoo/spugl
/*
 * Repeatedly merges a chunk of the cache line with the chunk that follows
 * it in the chunkNext chain, until no mergeable pair is left.
 *
 * All 16 chunk slots are tested at once with vector operations. Each pass
 * of the loop picks one candidate, unlinks its successor from the chain,
 * marks that successor with CHUNKNEXT_FREE_BLOCK and stores the updated
 * chain back into cache->chunkNext. The function returns as soon as a pass
 * finds no candidate.
 *
 * cache: cache line whose chunkNext (and, for debugging, chunkStartArray)
 *        is updated in place.
 *
 * NOTE(review): SHUF0, SHUF1 and MERGE are shuffle patterns defined outside
 * this block; the descriptions of the values they produce are inferred from
 * the variable names -- confirm against their definitions.
 */
inline void merge_cache_blocks(RenderableCacheLine* cache)
{
    vec_uchar16 next = cache->chunkNext;

    for (;;) {
        // next[next[i]] for every slot, and next with only the CHUNKNEXT_MASK bits kept
        vec_uchar16 nextnext = spu_shuffle(next, next, next);
        vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK));

        // per-halfword flags for the chunkStart entries that are equal to 0
        vec_ushort8 firstblock0 = spu_cmpeq( cache->chunkStart[0], 0);
        vec_ushort8 firstblock1 = spu_cmpeq( cache->chunkStart[1], 0);
        // change next to word offset, note we don't care what the low bit shifted in is
        vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 );
        // look up the "start == 0" flag of each slot's successor
        vec_uchar16 first = (vec_uchar16) spu_shuffle( firstblock0, firstblock1, firstshuf );

        // build byte indices for the two bytes of the successor's halfword triangle id
        vec_ushort8 tri0 = cache->chunkTriangle[0];
        vec_ushort8 tri1 = cache->chunkTriangle[1];
        vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1));
        vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254));

        // presumably the triangle id of each slot's successor, for slots 0-7 and 8-15
        vec_ushort8 ntri0 = spu_shuffle( tri0, tri1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) );
        vec_ushort8 ntri1 = spu_shuffle( tri0, tri1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) );

        // compare each slot's triangle id with the looked-up one
        vec_ushort8 trieq0 = spu_cmpeq( tri0, ntri0 );
        vec_ushort8 trieq1 = spu_cmpeq( tri1, ntri1 );

        // combi = first | ~trieq : set where the successor's start is 0 or the ids differ
        vec_uchar16 trieq = (vec_uchar16) spu_shuffle( trieq0, trieq1, MERGE );
        vec_uchar16 combi = spu_orc(first, trieq);

        // per-byte test of ~((next | nextnext) | combi) against 256-CHUNKNEXT_BUSY_BIT
        vec_uchar16 canmerge = spu_cmpgt( spu_nor(spu_or(next, nextnext), combi), 256-CHUNKNEXT_BUSY_BIT );

        // one bit per slot; non-zero when at least one slot passed the test
        vec_uint4 gather = spu_gather( canmerge );

        // leading-zero count minus 16: index of the first slot whose bit is set
        vec_uint4 mergeid = spu_sub( spu_cntlz( gather ), spu_promote((unsigned int)16, 0));

        // nothing left to merge
        if( !spu_extract(gather, 0) ) {
            return;
        }

        // Scalar equivalent of the two rotates below:
        //   unsigned int firstchunk = spu_extract(mergeid, 0);
        //   unsigned int nextchunk = cache->chunkNextArray[firstchunk];
        vec_uint4 v_chunkNext = (vec_uint4) si_rotqby( (qword) next, (qword) spu_add(mergeid,13) );
        vec_uint4 v_chunkNextNext = (vec_uint4) si_rotqby( (qword) next, (qword) spu_add(v_chunkNext,13) );

        // cache->chunkNextArray[firstchunk] = cache->chunkNextArray[nextchunk];
        next = spu_shuffle( (vec_uchar16) v_chunkNextNext, next, (vec_uchar16) si_cbd( (qword) mergeid, 0 ) );

        // cache->chunkNextArray[nextchunk] = CHUNKNEXT_FREE_BLOCK;
        next = spu_shuffle( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK), next, (vec_uchar16) si_cbd( (qword) v_chunkNext, 0 ) );

        // this is for debug use only, it's not really needed...
        // cache->chunkStartArray[nextchunk] = -1;
        cache->chunkStartArray[ spu_extract(v_chunkNext,0) & 255 ] = -1;

        // publish the updated chain before looking for the next candidate
        cache->chunkNext = next;
    }
}
예제 #9
0
void GenerateFrustum()
{
  // g_R2OCon.m_frustum_planes has 6 planes in the following order:
  //
  // 0: near  (there is igg code that assumes this is first)
  // 1: left
  // 2: right
  // 3: bottom
  // 4: top
  // 5: far   (there is igg code that assumes this is last)

  // the plane normals point into the interior of the frustum

  // copy T,B,L,R planes directly
  for (u32 i=1; i<5; i++)
  {
    g_Planes[i] = g_pViewData->m_frustum_planes[i];

    // loosen frustum based on lod, amplitude, and a fudgefactor
    f32 nw = spu_extract(g_Planes[i], 3);
    f32 ny = spu_extract(g_Planes[i], 1);

    //f32 slop = g_R2OCon.m_frustum_fudge_factor * g_WaterObject.m_amplitude * powf(0.5f, 0.5f*(f32)lod)
    //            * (g_R2OCon.m_step * 0.0625f);    // readjust based on changed in lod 0 from 16m to 128m stepsize
    //f32 slop = g_R2OCon.m_frustum_fudge_factor * powf(0.5f, 0.5f*(f32)lod);
    f32 slop = g_R2OCon.m_frustum_fudge_factor * step;

    nw += slop * fabsf(ny);
    g_Planes[i] = spu_insert(nw, g_Planes[i], 3);
  }

  // compute near subfrustum plane based on lod
  //f32 d = g_R2OCon.m_near0 * powf(0.5f, 0.5f*(f32)lod);
  f32 d = g_R2OCon.m_near0 * step * (1.0f/128.0f);

  // test
  if (lod==g_R2OCon.m_num_lods-1)
  {
    d = step;
  }

  vf32 camera_direction = (vf32){0,0,1,0};
  camera_direction = spu_insert(g_pViewData->m_world_to_camera_matrix.m_v0.z, camera_direction, 0);
  camera_direction = spu_insert(g_pViewData->m_world_to_camera_matrix.m_v1.z, camera_direction, 1);
  camera_direction = spu_insert(g_pViewData->m_world_to_camera_matrix.m_v2.z, camera_direction, 2);

  f32 dot = spu_extract(spu_dot3(camera_direction, g_pViewData->m_camera_position), 0);
  f32 w = -(dot + d);

  g_Planes[0] = spu_insert(w, camera_direction, 3);

  // reverse the sense of the near plane
  g_Planes[0] = -g_Planes[0];
}
예제 #10
0
파일: testutils.c 프로젝트: Mashewnutz/Slo
/*
 * Returns 1 when both lanes of both vll vectors of x equal the matching
 * lanes of y, 0 otherwise.
 */
int allequal_llroundf4( llroundf4_t x, llroundf4_t y )
{
   int vec, lane;

   /* Visit vll[0] lane 0, vll[0] lane 1, vll[1] lane 0, vll[1] lane 1 and
      stop at the first mismatch. */
   for ( vec = 0; vec < 2; vec++ )
   {
      for ( lane = 0; lane < 2; lane++ )
      {
         if ( spu_extract(x.vll[vec],lane) != spu_extract(y.vll[vec],lane) )
         {
            return 0;
         }
      }
   }

   return 1;
}
예제 #11
0
/*
 * Primality check on the 3-quadword number N.
 *
 * Computes r with _mpm_mod_exp_2 from the exponent (N-1)/2 and the modulus
 * N, then compares r against either 1 or N-1. Which value is expected
 * depends on N mod 8 and on fSophieGermain:
 *
 *   fSophieGermain != 0:  N mod 8 == 7 -> r == 1,   N mod 8 == 3 -> r == N-1
 *   fSophieGermain == 0:  N mod 8 == 5 -> r == N-1, N mod 8 == 1 -> r == 1
 *
 * Returns 1 when the matching comparison succeeds, 0 in every other case.
 */
int EulerLagrangeLifchitzPrimalityTest(vec_uint4 N[3], int fSophieGermain)
{
    const vec_uint4     one[3] = { { 0,0,0,0 }, { 0,0,0,0 }, { 0,0,0,1 } };
    vec_uint4   r[3];
    vec_uint4   N_1[3];
    vec_uint4   N_1_2[3];
    int         nMod8;

    /* N_1 = N - 1, N_1_2 = N_1 shifted right by one bit */
    _mpm_sub(N_1, N, one, 3);
    MPM_SHR_BITS_LARGE(N_1_2, N_1, 3, zero, 1);

    _mpm_mod_exp_2(r, N_1_2, 3, N, 3, 6);

    /* low three bits of the last word of the last quadword of N */
    nMod8 = spu_extract(N[2], 3) & 7;

    switch (nMod8)
    {
    case 7:
        if (fSophieGermain && _mpm_cmpeq(r, one, 3)) return 1;
        break;
    case 3:
        if (fSophieGermain && _mpm_cmpeq(r, N_1, 3)) return 1;
        break;
    case 5:
        if (!fSophieGermain && _mpm_cmpeq(r, N_1, 3)) return 1;
        break;
    case 1:
        if (!fSophieGermain && _mpm_cmpeq(r, one, 3)) return 1;
        break;
    default:
        break;
    }

    return 0;
}
예제 #12
0
/*
 * Flushes the triangle data written since _currentTriangle.
 *
 * Does nothing when endTriangle equals _currentTriangle. Otherwise it
 * stores the link to the next triangle in the current one, DMAs the
 * triangle buffer out, DMAs the new end offset (one short) to the cache
 * line's end-of-triangle address, clears _currentTriangle and waits for
 * DMA tag 0 to complete.
 */
void writeTriangleBuffer(Triangle* endTriangle)
{
	// nothing new to write
	if (endTriangle == _currentTriangle)
		return;

	char* const end_bytes = (char*)endTriangle;

	// bytes from the start of the buffer up to endTriangle, rounded up to a multiple of 128
	int dma_length = (end_bytes - _currentTriangleBuffer + 127) & ~127;

	// offset of the end of the data: size of what was written plus the current offset
	unsigned short end_offset = (end_bytes - ((char*)_currentTriangle)) + _currentTriangleOffset;
	vec_ushort8 v_end_offset = spu_promote(end_offset, 1);

	// calculate genuine next pointer ( rewind==0 -> next, rewind!=0 -> 0 )
	unsigned short next_offset = spu_extract( spu_andc( v_end_offset, _currentTriangleRewind ), 1 );
	_currentTriangle->next_triangle = next_offset;

	// DMA the triangle data out
	spu_mfcdma64(_currentTriangleBuffer, mfc_ea2h(_currentTriangleBufferEA), mfc_ea2l(_currentTriangleBufferEA), dma_length, 0, MFC_PUT_CMD);

	// update the information in the cache line; _currentTriangleRewind is
	// re-used as the DMA source because its old value is no longer needed
	_currentTriangleRewind = spu_splats(next_offset);
	char* dma_src = ((char*)&_currentTriangleRewind) + (_currentTriangleCacheEndTriangleEAL & 15);
	spu_mfcdma64(dma_src, _currentTriangleCacheEndTriangleEAH, _currentTriangleCacheEndTriangleEAL, sizeof(short), 0, MFC_PUTB_CMD);

	// finally invalidate the triangle info
	_currentTriangle = NULL;

	// and make sure the DMA completed
	mfc_write_tag_mask(1<<0);
	mfc_read_tag_status_all();
}
예제 #13
0
unsigned int
__mfc_tag_release (unsigned int tag)
{
  vector unsigned int is_invalid;
  vector unsigned int mask = (vector unsigned int)
	{ 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
  vector signed int zero = (vector signed int) { 0, 0, 0, 0 };

  vector signed int has_been_reserved;

  /* Check if the tag is out of range.  */
  is_invalid = spu_cmpgt (spu_promote (tag, 0), 31);

  /* Check whether the tag has been reserved, set to all 1 if has not
     been reserved, 0 otherwise.  */
  has_been_reserved = (vector signed int) spu_rl (__mfc_tag_table, tag);
  has_been_reserved = (vector signed int) spu_cmpgt (zero, has_been_reserved);

  /* Set invalid.  */
  is_invalid = spu_or ((vector unsigned int) has_been_reserved, is_invalid);

  mask = spu_rlmask (mask, (int)(-tag));
  __mfc_tag_table = spu_or (__mfc_tag_table, mask);

  return spu_extract(is_invalid, 0);
}
예제 #14
0
/*
 * Returns the number of elements currently held in the ring buffer on the
 * given side of the active merger (am): head index minus tail index,
 * wrapped by the buffer size when negative.
 *
 * For the OUT side of a merger whose mcb[am].local[OUT] is below 255, the
 * indices and size are taken from that parent merger instead, on the
 * parent side selected by (mcb[am].id+1)&1.
 */
int num_in_buffer(int side){

  const int use_parent = (side == OUT && mcb[am].local[OUT] < 255);
  volatile vector signed int *head_idx, *tail_idx;
  int capacity;

  if(use_parent){
    const int parent_idx = mcb[am].local[OUT];
    const int parent_side = (mcb[am].id+1)&1;
    head_idx = &md[parent_idx].idx[parent_side][HEAD];
    tail_idx = &md[parent_idx].idx[parent_side][TAIL];
    capacity = mcb[parent_idx].buffer_size[parent_side];
  } else {
    head_idx = &md[am].idx[side][HEAD];
    tail_idx = &md[am].idx[side][TAIL];
    capacity = mcb[am].buffer_size[side];
  }

  // distance from tail to head, taken from the first word of the vectors
  int used = spu_extract(spu_sub(*head_idx,*tail_idx),0);

  // head has wrapped around behind tail
  return (used < 0) ? used + capacity : used;
}
/* Reserve NUMBER_OF_TAGS consecutive MFC tags.

   Scans a copy of __mfc_tag_table from the most significant bit downwards
   for a run of set bits at least NUMBER_OF_TAGS long.  On success the
   matching bits are cleared in __mfc_tag_table and the index of the first
   tag of the run is returned.  When the scan runs out of set bits the
   table is left unchanged and all ones is returned.  */
unsigned int
__mfc_multi_tag_reserve (unsigned int number_of_tags)
{
  vector unsigned int table_copy;
  vector unsigned int one = (vector unsigned int)
        { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
  vector unsigned int count_busy, is_valid;
  vector unsigned int count_total;
  vector unsigned int count_avail = (vector unsigned int) { 0, 0, 0, 0 };
  vector unsigned int index = (vector unsigned int) { 0, 0, 0, 0 };

  table_copy = __mfc_tag_table;


  /* count_busy: number of consecutive busy tags
     count_avail: number of consecutive free tags
     table_copy: temporary copy of the tag table
     count_total: sum of count_busy and count_avail
     index: index of the current working tag  */
  do
    {
      /* Drop the run of free tags counted on the previous pass.  */
      table_copy = spu_sl (table_copy, count_avail);

      /* Skip the leading zero bits, then count the leading one bits that
         follow them (leading zeros of the complement).  */
      count_busy = spu_cntlz (table_copy);
      table_copy = spu_sl (table_copy, count_busy);
      count_avail = spu_cntlz (spu_xor(table_copy, -1));
      count_total = spu_add (count_busy, count_avail);
      index = spu_add (index, count_total);
    }
  /* Stop once the run is long enough or no set bits remain.  */
  while (spu_extract (count_avail, 0) < number_of_tags
	 && spu_extract (table_copy, 0) != 0);

  /* index has advanced past the final run; step back to its first tag.  */
  index = spu_sub (index, count_avail);

  /* is_valid is set to 0xFFFFFFFF if table_copy == 0, 0 otherwise.
     NOTE(review): despite its name, a set is_valid marks the failure
     case -- it forces index to all ones and keeps the table unchanged.  */
  is_valid = spu_cmpeq (table_copy, 0);
  index = spu_sel (index, is_valid, is_valid);

  /* Now I need to actually mark the tags as used: build a word with
     number_of_tags zero bits, rotate the zeros to the position of the
     run, and use it to clear those bits of the table.  */
  table_copy = spu_sl (one, number_of_tags);
  table_copy = spu_rl (table_copy, -number_of_tags - spu_extract (index, 0));
  table_copy = spu_sel (table_copy, __mfc_tag_table, table_copy);
  __mfc_tag_table = spu_sel (table_copy, __mfc_tag_table, is_valid);

  return spu_extract (index, 0);
}
예제 #16
0
파일: zhalfast_f.c 프로젝트: bambang/vsipl
/*
 * In-place split-complex FFT kernel for one work block.
 *
 * inout holds `chunks` rows of real parts followed by `chunks` rows of
 * imaginary parts, each row `size` floats long. Every row is transformed
 * with cml_zzfft1d_op_f in the direction given by the parameters and, when
 * the scale is not 1, the whole block is multiplied by that scale.
 *
 * The FFT object is (re)built on iteration 0 whenever the size differs from
 * current_size. On the last iteration the chunk count is reduced to the
 * remainder when the work blocks would exceed chunks_per_spe.
 *
 * pf is unused. Always returns 0.
 */
static int
kernel_zfft_f(lwp_functions* pf,
	      void*             params,
	      void*             inout,
	      unsigned int      iter,
	      unsigned int      iter_max)
{
  static fft1d_f* obj;

  Fft_split_params* fft_params = (Fft_split_params *)params;
  unsigned int      size       = fft_params->size;
  unsigned int      chunks     = fft_params->chunks_per_wb;
  unsigned int      row;
  int               dir;

  if (fft_params->direction == fwd_fft)
    dir = CML_FFT_FWD;
  else
    dir = CML_FFT_INV;

  // Last work block may be partial.
  if (iter == iter_max-1 && iter_max * chunks > fft_params->chunks_per_spe)
    chunks = fft_params->chunks_per_spe % chunks;

  assert(size >= MIN_FFT_1D_SIZE);
  assert(size <= MAX_FFT_1D_SIZE);

  // Set up the FFT object when a new size starts.
  if (iter == 0 && size != current_size)
  {
#if !NDEBUG
    // Check that buffer space doesn't overlap with stack
    register volatile vector unsigned int get_r1 asm("1");
    unsigned int sp_value = spu_extract(get_r1, 0);
    assert(buf + 2*MAX_FFT_1D_SIZE + size + 128/4 < sp_value);
#endif
    int rt = cml_fft1d_setup_f(&obj, CML_FFT_CC, size,
			       buf + 2*MAX_FFT_1D_SIZE);
    assert(rt && obj != NULL);
    current_size = size;
  }

  // Real rows come first, imaginary rows start after `chunks` rows.
  float* inout_re = (float*)inout + 0        * size;
  float* inout_im = (float*)inout + 1*chunks * size;

  for (row = 0; row < chunks; ++row)
  {
    float* row_re = (float*)inout_re + row*size;
    float* row_im = (float*)inout_im + row*size;

    // Input and output are the same rows: the transform is in place.
    cml_zzfft1d_op_f(obj, row_re, row_im, row_re, row_im, dir, buf);
  }

  if (fft_params->scale != (double)1.f)
  {
    // Real and imaginary rows are contiguous, so one real scalar-vector
    // multiply over 2*size*chunks floats scales both at once.
    cml_core_svmul1_f(fft_params->scale, inout_re, inout_re, 2*size*chunks);
  }

  return 0;
}
예제 #17
0
/* Grow the heap by INCREMENT bytes.

   The heap starts at &_end and grows upwards.  The request is granted
   only while the gap between the heap end and the current stack pointer,
   less STACKSIZE, is at least INCREMENT.

   On success the previous heap end is returned, and INCREMENT is
   subtracted from word 1 of the stack pointer register and of every frame
   reached through the back chain.  On failure errno is set to ENOMEM and
   (void *) -1 is returned.  */
void *
sbrk (ptrdiff_t increment)
{
	/* Current end of the heap; set to &_end on the first call.  */
	static caddr_t heap_ptr = NULL;
	caddr_t base;
	vector unsigned int sp_reg, sp_delta;
	vector unsigned int *sp_ptr;
	caddr_t sps;

	/* The stack pointer register.  */
	volatile register vector unsigned int sp_r1 __asm__("1");
	
	if (heap_ptr == NULL)
	  heap_ptr = (caddr_t) & _end;
	
	/* Word 0 of the register holds the stack pointer itself.  */
	sps = (caddr_t) spu_extract (sp_r1, 0);
	if (((int) sps - STACKSIZE - (int) heap_ptr) >= increment)
	  {
	    base = heap_ptr;
	    heap_ptr += increment;
	    
	    /* Vector that is zero everywhere except word 1 = increment.  */
	    sp_delta = (vector unsigned int) spu_insert (increment, spu_splats (0), 1);

	    /* Subtract sp_delta from the SP limit (word 1).  */
	    sp_r1 = spu_sub (sp_r1, sp_delta);
	    
	    /* Fix-up backchain: apply the same adjustment to every saved
	       frame, following word 0 of each one until it is zero.  */
	    sp_ptr = (vector unsigned int *) spu_extract (sp_r1, 0);
	    do
	      {
		sp_reg = *sp_ptr;
		*sp_ptr = (vector unsigned int) spu_sub (sp_reg, sp_delta);
	      }
	    while ((sp_ptr = (vector unsigned int *) spu_extract (sp_reg, 0)));

	    return (base);
	  }
	else
	  {
	    /* Not enough room between heap and stack.  */
	    errno = ENOMEM;
	    return ((void *) -1);
	  }
}
예제 #18
0
/// Returns the support point of a scaled point cloud for a direction.
///
/// The direction localDirOrg is multiplied component-wise by localScaling,
/// the point with the largest dot product against that vector is located,
/// and that point multiplied by localScaling is returned.
///
/// On the Cell SPU (__CELLOS_LV2__ and __SPU__ defined) the search is done
/// with SIMD intrinsics; otherwise btVector3::maxDot is used.
///
/// NOTE(review): in the SPU branch the index starts at -999 and nothing
/// checks it, so numPoints == 0 would index points[-999] -- confirm that
/// callers never pass an empty array.
static btVector3 convexHullSupport (const btVector3& localDirOrg, const btVector3* points, int numPoints, const btVector3& localScaling)
{	

	btVector3 vec = localDirOrg * localScaling;

#if defined (__CELLOS_LV2__) && defined (__SPU__)

	btVector3 localDir = vec;

	// running maximum and its index; only element 0 of each vector is used at the end
	vec_float4 v_distMax = {-FLT_MAX,0,0,0};
	vec_int4 v_idxMax = {-999,0,0,0};
	int v=0;
	int numverts = numPoints;

	// main loop: four points per pass, reduced pairwise to one candidate
	for(;v<(int)numverts-4;v+=4) {
		vec_float4 p0 = vec_dot3(points[v  ].get128(),localDir.get128());
		vec_float4 p1 = vec_dot3(points[v+1].get128(),localDir.get128());
		vec_float4 p2 = vec_dot3(points[v+2].get128(),localDir.get128());
		vec_float4 p3 = vec_dot3(points[v+3].get128(),localDir.get128());
		const vec_int4 i0 = {v  ,0,0,0};
		const vec_int4 i1 = {v+1,0,0,0};
		const vec_int4 i2 = {v+2,0,0,0};
		const vec_int4 i3 = {v+3,0,0,0};
		// larger of (p0,p1) and its index
		vec_uint4  retGt01 = spu_cmpgt(p0,p1);
		vec_float4 pmax01 = spu_sel(p1,p0,retGt01);
		vec_int4   imax01 = spu_sel(i1,i0,retGt01);
		// larger of (p2,p3) and its index
		vec_uint4  retGt23 = spu_cmpgt(p2,p3);
		vec_float4 pmax23 = spu_sel(p3,p2,retGt23);
		vec_int4   imax23 = spu_sel(i3,i2,retGt23);
		// larger of the two pair winners
		vec_uint4  retGt0123 = spu_cmpgt(pmax01,pmax23);
		vec_float4 pmax0123 = spu_sel(pmax23,pmax01,retGt0123);
		vec_int4   imax0123 = spu_sel(imax23,imax01,retGt0123);
		// keep the running maximum only when it is strictly greater
		vec_uint4  retGtMax = spu_cmpgt(v_distMax,pmax0123);
		v_distMax = spu_sel(pmax0123,v_distMax,retGtMax);
		v_idxMax = spu_sel(imax0123,v_idxMax,retGtMax);
	}
	// tail loop: remaining points one at a time
	for(;v<(int)numverts;v++) {
		vec_float4 p = vec_dot3(points[v].get128(),localDir.get128());
		const vec_int4 i = {v,0,0,0};
		vec_uint4  retGtMax = spu_cmpgt(v_distMax,p);
		v_distMax = spu_sel(p,v_distMax,retGtMax);
		v_idxMax = spu_sel(i,v_idxMax,retGtMax);
	}
	int ptIndex = spu_extract(v_idxMax,0);
	const btVector3& supVec= points[ptIndex] * localScaling;
	return supVec;
#else

    // generic path: let btVector3 find the index of the largest dot product
    btScalar maxDot;
    long ptIndex = vec.maxDot( points, numPoints, maxDot);
	btAssert(ptIndex >= 0);
	btVector3 supVec = points[ptIndex] * localScaling;
	return supVec;
#endif //__SPU__
}
예제 #19
0
int main(int argc, char **argv) {

   int i, j;
   vector unsigned int int_vec;
   vector float float_vec;
   vector double double_vec[4];
   
   /* Get the current time to use as seed */
   struct timeval time;
   gettimeofday(&time, NULL);

   /* The hardware number generator */
   printf("\nHardware Generator:\n");   
   if(mc_rand_hw_init() == 0) {
      int_vec = mc_rand_hw_u4();
      for(i=0; i<4; i++)
         printf("%u ",spu_extract(int_vec, i));
      printf("\n\n");
   }
   else
      printf("Hardware RNG is not available.\n\n");
   
   /* The Kirkpatrick Stoll PRNG */
   mc_rand_ks_init(time.tv_sec);
   float_vec = mc_rand_ks_0_to_1_f4();
   printf("Kirkpatrick-Stoll:\n");
   for(i=0; i<4; i++)
      printf("%f ", spu_extract(float_vec, i));
   printf("\n\n");
   
   /* The Mersenne Twister PRNG */
   mc_rand_mt_init(time.tv_sec);
   mc_rand_mt_minus1_to_1_array_d2(4, double_vec);
   printf("Mersenne Twister:\n");
   for(i=0; i<4; i++) {
      for(j=0; j<2; j++)
         printf("%g ",spu_extract(double_vec[i], j));
      printf("\n");
   }
   
   return 0;
}
예제 #20
0
int main(int argc, char **argv) {
   int i;
   vector unsigned int all_ones = (vector unsigned int) 
      {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};

   vector unsigned int all_zeroes = (vector unsigned int) 
      {0x00000000, 0x00000000, 0x00000000, 0x00000000};
   
   /* These bits will form the selection mask */
   unsigned short mask = 0x9;
   
   /* Each bit in 0x9 forms a word in the mask */
   vector unsigned int resultw = 
      spu_sel(all_zeroes, all_ones, spu_maskw(mask));
   printf("resultw: ");
   for (i=0; i<4; i++) {
      printf("%08x", spu_extract(resultw, i));
   }
   
   /* Each bit in 0x09 forms a halfword in the mask */
   vector unsigned short resulth = 
      spu_sel((vector unsigned short)all_zeroes, 
              (vector unsigned short)all_ones, 
              spu_maskh(mask));
   printf("\nresulth: ");
   for (i=0; i<8; i++) {
      printf("%04x", spu_extract(resulth, i));
   }

   /* Each bit in 0x0009 forms a byte in the mask */
   vector unsigned char resultb = 
      spu_sel((vector unsigned char)all_zeroes, 
              (vector unsigned char)all_ones, 
              spu_maskb(mask));
   printf("\nresultb: ");
   for (i=0; i<16; i++) {
      printf("%02x", spu_extract(resultb, i));
   }
   printf("\n");
   return 0;
}
예제 #21
0
/*
 * Copies elements from the input ring buffer on `side` of the active merger
 * (am) into its output buffer, without merging.
 *
 * The number copied is the smaller of the free space on the OUT side and
 * the number of elements waiting on `side`. After the copy the tail of
 * `side` is published with update_tail(), the consumed counter is advanced
 * and, when the output is a local parent and everything on `side` has been
 * consumed, the merger is marked depleted and done.
 *
 * The output head index lives either in the local parent merger (when
 * mcb[am].local[OUT] < 255) or in this merger's own OUT buffer. The wrap
 * size for that head is taken from the same place as the head itself;
 * previously it was always read from mcb[mcb[am].local[OUT]], which for a
 * value of 255 or more indexes mcb with the "no local parent" value and
 * disagrees with num_in_buffer().
 */
void cp_buffer(int side){
  int avail_out = num_free_in_buffer(OUT);
  int avail_side = num_in_buffer(side);
  int max = avail_out < avail_side ? avail_out : avail_side;

  /* Pick the output head index and its ring size together. */
  vector signed int *out_head;
  int out_buffer_size;
  if(mcb[am].local[OUT] < 255){
    int parent_idx = mcb[am].local[OUT];
    int parent_side = (mcb[am].id+1)&1;
    out_head = (vector signed int*) &md[parent_idx].idx[parent_side][HEAD];
    out_buffer_size = mcb[parent_idx].buffer_size[parent_side];
  } else {
    out_head = (vector signed int*) &md[am].idx[OUT][HEAD];
    out_buffer_size = mcb[am].buffer_size[OUT];
  }

  vector unsigned int cmp_v;
  vector signed int from_size = spu_splats( mcb[am].buffer_size[side] );
  vector signed int out_size = spu_splats( out_buffer_size );
  vector signed int ones = {1,1,1,1};
  vector signed int zeros = {0,0,0,0};

  int i;
  for(i = 0; i < max; i++){
    md[am].buffer[OUT][spu_extract( *out_head,0)] = md[am].buffer[side][spu_extract(md[am].idx[side][TAIL],0)];

    /* advance the input tail, wrapping to 0 at the end of the ring */
    md[am].idx[side][TAIL] = spu_add(md[am].idx[side][TAIL], ones);
    cmp_v = spu_cmpeq(md[am].idx[side][TAIL],from_size);
    md[am].idx[side][TAIL] = spu_sel(md[am].idx[side][TAIL], zeros, cmp_v);

    /* advance the output head, wrapping to 0 at the end of the ring */
    *out_head = spu_add(*out_head,ones);
    cmp_v = spu_cmpeq(*out_head, out_size);
    *out_head = spu_sel(*out_head,zeros,cmp_v);
  }

  update_tail(side);

  md[am].consumed[side] += max;

  /* everything on this side has been passed on to a local parent */
  if(mcb[am].local[OUT] < 255 && md[am].consumed[side] == mcb[am].data_size[side]){
    md[am].depleted[side] = 1;
    md[am].done = 1;
    --num_active_mergers;
  }
}
예제 #22
0
// Marks the cells of the current window that are covered by holes.
//
// For every hole whose tag matches the current water object and whose lod
// matches `lod`, the hole is walked m_cnt steps along x or z (per m_dir).
// Each step that falls inside the window sets bit 0x80 in the matching
// outcodes cell. If the camera's x/z lies within one step of such a point,
// g_RenderData.m_b_camera_over_water is cleared.
//
// outcodes: width*height cells, row-major (row index times width plus column).
void R2O_CutHoles(u8 *outcodes, i32 width, i32 height, u32 lod)
{
  // window extents in world space
  const f32 win_x_lo = spu_extract(origin_world, 0);
  const f32 win_z_lo = spu_extract(origin_world, 2);
  const f32 win_x_hi = win_x_lo + (f32)width  * step;
  const f32 win_z_hi = win_z_lo + (f32)height * step;

  // tag of the current water object: XOR of words 0 and 2 of its origin
  vu32 origin_words = *(vu32 *)&g_WaterObject.m_origin;
  const u32 water_tag = spu_extract(origin_words, 0) ^ spu_extract(origin_words, 2);

  R2OHole *hole = g_Holes;
  for (u32 h=0; h<g_R2OCon.m_num_holes; h++, hole++)
  {
    // only holes cut into this water object at this lod
    if (!(water_tag==hole->m_tag && lod==hole->m_lod))
    {
      continue;
    }

    // start of the hole
    f32 x = hole->m_xcoord;
    f32 z = hole->m_zcoord;

    // the hole advances two steps at a time along z (m_dir set) or x
    f32 dx, dz;
    if (hole->m_dir)
    {
      dx = 0.0f;
      dz = 2.0f*step;
    }
    else
    {
      dx = 2.0f*step;
      dz = 0.0f;
    }

    // walk the length of the hole, stepping the coords after every point
    for (u32 seg=0; seg<hole->m_cnt; seg++, x += dx, z += dz)
    {
      // skip points outside the current window
      if (!(x>=win_x_lo && x<win_x_hi && z>=win_z_lo && z<win_z_hi))
      {
        continue;
      }

      // translate coords to col/row values
      const i32 col = (i32)((x-win_x_lo) * inv_step);
      const i32 row = (i32)((z-win_z_lo) * inv_step);

      // punch out the hole
      outcodes[row*width + col] |= 0x80;

      // test camera against the square of +/- one step around the point
      const f32 hole_x_lo = x - step;
      const f32 hole_x_hi = x + step;
      const f32 hole_z_lo = z - step;
      const f32 hole_z_hi = z + step;
      const f32 cam_x = spu_extract(g_pViewData->m_camera_position, 0);
      const f32 cam_z = spu_extract(g_pViewData->m_camera_position, 2);
      if (cam_x>=hole_x_lo && cam_x<hole_x_hi && cam_z>=hole_z_lo && cam_z<hole_z_hi)
      {
        g_RenderData.m_b_camera_over_water = false;
      }
    }
  }
}
예제 #23
0
int 
main(unsigned long long id) {
  vector unsigned int x = get_vector_param_3();
  vector unsigned int count  = (vector unsigned int){0,0,0,0};
  vector unsigned int result = (vector unsigned int){0,0,0,0};

  spu_ready();

  count  = popc(x);
  result = reduce_word(count);
  
  spu_write_out_mbox(spu_extract(result, 0));

  return SPU_SUCCESS;
}
예제 #24
0
/* Compare at most N3 bytes of the two __ea strings S1 and S2.

   The strings are compared piece by piece through the software cache.
   Each piece is limited to what is left of N3 and to what remains of the
   128-byte cache line of either string.  Returns the result of the first
   piece that either differs or reports an end through end_v, or 0 when N3
   is 0 or all N3 bytes compared equal.  */
int
strncmp_ea (__ea void *s1, __ea const void *s2, size_ea_t n3)
{
  __ea void *pos1 = (__ea void *) s1;
  __ea void *pos2 = (__ea void *) s2;
  vec_uint4 end_v;
  int ret = 0;			/* result when n3 is 0 */

  while (n3 != 0)
    {
      /* Bring the current cache line of each string into local store.  */
      void *local2 = __cache_fetch (pos2);
      void *local1 = __cache_fetch (pos1);

      /* Bytes left in each cache line from the current position.  */
      size_ea_t room2 = ROUND_UP_NEXT_128 ((size_ea_t) pos2) - (size_ea_t) pos2;
      size_ea_t room1 = ROUND_UP_NEXT_128 ((size_ea_t) pos1) - (size_ea_t) pos1;

      /* Compare the largest piece that stays inside both lines and n3.  */
      int span = three_way_min (room2, room1, n3);

      ret = _strncmp_internal (local1, local2, span, &end_v, 1);

      /* Only the first slot of end_v is set.  Stop on a difference or when
	 that slot is non-zero; ret is returned either way because it might
	 not be zero.  */
      if (ret != 0 || spu_extract (end_v, 0) != 0)
	return ret;

      pos2 += span;
      pos1 += span;
      n3 -= span;
    }

  return ret;
}
예제 #25
0
unsigned int
__mfc_tag_reserve (void)
{
  vector unsigned int mask = (vector unsigned int)
	{ 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
  vector unsigned int count_zeros, is_valid;
  vector signed int count_neg;

  count_zeros = spu_cntlz (__mfc_tag_table);
  count_neg = spu_sub (0, (vector signed int) count_zeros);

  mask = spu_rlmask (mask, (vector signed int) count_neg);
  __mfc_tag_table = spu_andc (__mfc_tag_table, mask);

  is_valid = spu_cmpeq (count_zeros, 32);
  count_zeros = spu_sel (count_zeros, is_valid, is_valid);

  return spu_extract (count_zeros, 0);
}
예제 #26
0
/*
 * Starts DMA transfers that refill the input ring buffer on `side` of the
 * active merger (am) from main memory.
 *
 * The amount pulled is the smallest of: free space in the buffer, data not
 * yet pulled, and MAX_DMA_SIZE. The transfer starts at the head index and
 * is split in two when it would run past the end of the ring. The reserved
 * tag is stored in held_tag[side] and the element count in
 * num_waiting[side]; this function does not wait for completion.
 * Returns without doing anything when there is nothing to pull or no tag
 * can be reserved.
 */
void pull(int side){
  // how much to pull: min(free space, data remaining, MAX_DMA_SIZE)
  int num_pull = num_free_in_buffer(side);
  const int remaining = mcb[am].data_size[side] - md[am].num_pulled[side];
  if(!(num_pull < remaining))
    num_pull = remaining;
  if(!(num_pull < MAX_DMA_SIZE))
    num_pull = MAX_DMA_SIZE;

  // the first transfer may only run up to the end of the ring
  const int head = spu_extract(md[am].idx[side][HEAD],0);
  const int until_wrap = mcb[am].buffer_size[side] - head;
  int first_pull = num_pull;
  if(!(num_pull < until_wrap))
    first_pull = until_wrap;

  if(first_pull == 0)
    return;

  // local destination of the first transfer
  unsigned int dest = (unsigned int) &md[am].buffer[side][head];

  int tag = mfc_tag_reserve();
  if(tag == MFC_TAG_INVALID)
    return;
  md[am].held_tag[side] = tag;

  // first part: from head towards the end of the ring
  mfc_get((void*)dest,
	  mcb[am].block_addr[side],
	  first_pull * sizeof(vector signed int),
	  md[am].held_tag[side],
	  0,0);
  mcb[am].block_addr[side] += first_pull * sizeof(vector signed int);

  // second part: whatever is left goes to the start of the ring
  if(first_pull < num_pull){
    const int second_pull = num_pull - first_pull;
    dest = (unsigned int) &md[am].buffer[side][0];

    mfc_get((void*)dest,
	    mcb[am].block_addr[side],
	    second_pull * sizeof(vector signed int),
	    md[am].held_tag[side],
	    0,0);
    mcb[am].block_addr[side] += second_pull * sizeof(vector signed int);
  }

  md[am].num_waiting[side] = num_pull;
}
예제 #27
0
/* Release NUMBER_OF_TAGS consecutive MFC tags starting at FIRST_TAG.

   When the request is valid, the bits of the tags are set in
   __mfc_tag_table and 0 is returned.  When it is not, the table is left
   unchanged and all ones is returned.  A request is invalid when
   FIRST_TAG is greater than 31, NUMBER_OF_TAGS is greater than 32,
   FIRST_TAG + NUMBER_OF_TAGS is greater than 32, or the reservation check
   below fails.  */
unsigned int
__mfc_multi_tag_release (unsigned int first_tag, unsigned int number_of_tags)
{
  vector unsigned int table_copy, tmp, tmp1;
  vector unsigned int one = (vector unsigned int)
        { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
  vector unsigned int is_invalid;
  unsigned int last_tag;
  vector unsigned int has_been_reserved;

  /* One past the last tag to release.  */
  last_tag = first_tag + number_of_tags;

  /* Build the mask of bits to set: number_of_tags zero bits, rotated to
     the position of the tags, then inverted so those bits become ones.  */
  table_copy = spu_sl (one, number_of_tags);
  table_copy = spu_rl (table_copy, -last_tag);
  table_copy = spu_xor (table_copy, -1);

  /* Make sure the tags are in range and valid.  */
  tmp = spu_cmpgt (spu_promote(last_tag, 0), 32);
  tmp1 = spu_cmpgt (spu_promote(number_of_tags, 0), 32);
  is_invalid =  spu_cmpgt (spu_promote(first_tag, 0), 31);

  /* All bits are set to 1 if invalid, 0 if valid.  */
  is_invalid = spu_or (tmp, is_invalid);
  is_invalid = spu_or (tmp1, is_invalid);

  /* check whether these tags have been reserved: the table is shifted so
     first_tag sits in the top bit and compared with a word whose top
     number_of_tags bits are clear.
     NOTE(review): despite its name, has_been_reserved is all ones when the
     shifted table is the greater value, which presumably means one of the
     requested tags is already marked free -- confirm.  */
  tmp = spu_rlmask (one, (int)-number_of_tags);
  tmp1 = spu_sl (__mfc_tag_table, first_tag);
  has_been_reserved = spu_cmpgt(tmp1, tmp);

  is_invalid = spu_or (has_been_reserved, is_invalid);

  /* Set the masked bits, but keep the old table when the request is
     invalid.  */
  table_copy = spu_sel (__mfc_tag_table, table_copy, table_copy);
  __mfc_tag_table = spu_sel (table_copy, __mfc_tag_table, is_invalid);

  return spu_extract (is_invalid, 0);
}
예제 #28
0
/**
 * Evaluate the triangle's attribute coefficients at the four pixels of a
 * quad to produce the fragment shader inputs.
 * \param x  quad x pos
 * \param y  quad y pos
 * \param fragZ  returns the quad's four Z values
 * \param fragInputs  returns fragment program inputs, four vectors
 *                    (one per channel) for each attribute
 * Attribute slot 0 holds the position; shader inputs start at slot 1.
 * Note: this code could be incorporated into the fragment program
 * itself to avoid the loop and switch.
 */
static void
eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[])
{
   /* per-pixel offsets of the four fragments inside the quad */
   static const vector float quadDX = (const vector float) {0, 1, 0, 1};
   static const vector float quadDY = (const vector float) {0, 0, 1, 1};

   const uint posSlot = 0;
   const vector float pos0 = setup.coef[posSlot].a0;
   const vector float posDX = setup.coef[posSlot].dadx;
   const vector float posDY = setup.coef[posSlot].dady;
   const vector float fragX = spu_splats(x) + quadDX;
   const vector float fragY = spu_splats(y) + quadDY;
   vector float fragW, invW;
   uint i;

   /* plane equations for Z and W, then the reciprocal of W */
   *fragZ = splatz(pos0) + fragX * splatz(posDX) + fragY * splatz(posDY);
   fragW =  splatw(pos0) + fragX * splatw(posDX) + fragY * splatw(posDY);
   invW = spu_re(fragW);  /* 1 / w */

   /* one pass per fragment program input */
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      const uint slot = i + 1;
      const enum interp_mode mode = spu.vertex_info.attrib[slot].interp_mode;
      const int isPerspective = (mode == INTERP_PERSPECTIVE);
      const int hasGradient = (mode == INTERP_LINEAR || isPerspective);

      /* constant term: each channel replicated across the quad */
      const vector float base = setup.coef[slot].a0;
      vector float chan0 = splatx(base);
      vector float chan1 = splaty(base);
      vector float chan2 = splatz(base);
      vector float chan3 = splatw(base);

      if (hasGradient) {
         /* linear term, as nested multiply-adds:
          * chan += fragX * d/dx + fragY * d/dy
          */
         const vector float gradX = setup.coef[slot].dadx;
         const vector float gradY = setup.coef[slot].dady;
         chan0 = spu_madd(fragX, splatx(gradX), spu_madd(fragY, splatx(gradY), chan0));
         chan1 = spu_madd(fragX, splaty(gradX), spu_madd(fragY, splaty(gradY), chan1));
         chan2 = spu_madd(fragX, splatz(gradX), spu_madd(fragY, splatz(gradY), chan2));
         chan3 = spu_madd(fragX, splatw(gradX), spu_madd(fragY, splatw(gradY), chan3));
      }

      if (isPerspective) {
         /* perspective term */
         chan0 *= invW;
         chan1 *= invW;
         chan2 *= invW;
         chan3 *= invW;
      }

      fragInputs[CHAN0] = chan0;
      fragInputs[CHAN1] = chan1;
      fragInputs[CHAN2] = chan2;
      fragInputs[CHAN3] = chan3;
      fragInputs += 4;
   }
}


/**
 * Emit a quad (pass to next stage).  No clipping is done.
 * A quad whose mask has no bits set is dropped immediately.
 * Note: about 1/5 to 1/7 of the time, mask is zero and this function
 * should be skipped.  But adding the test for that slows things down
 * overall.
 */
static INLINE void
emit_quad( int x, int y, mask_t mask)
{
   vector float inputs[4*4], outputs[2*4];
   vector unsigned int kill_mask;
   vector float fragZ;
   int ix, iy;

   /* Nothing to do unless some bit in mask is set */
   if (!spu_extract(spu_orx(mask), 0)) {
      return;
   }

   /* quad position relative to the clip rectangle origin */
   ix = x - setup.cliprect_minx;
   iy = y - setup.cliprect_miny;

   spu.cur_ctile_status = TILE_STATUS_DIRTY;
   spu.cur_ztile_status = TILE_STATUS_DIRTY;

   /*
    * Run fragment shader, execute per-fragment ops, update fb/tile.
    */
   eval_inputs((float) x, (float) y, &fragZ, inputs);

   ASSERT(spu.fragment_program);
   ASSERT(spu.fragment_ops);

   /* Execute the current fragment program, then drop the fragments it
    * killed from the mask.
    */
   kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
   mask = spu_andc(mask, kill_mask);

   /* Execute per-fragment/quad operations, including:
    * alpha test, z test, stencil test, blend and framebuffer writing.
    * There is one fragment operations function for front-facing
    * fragments and one for back-facing fragments (often the same, but
    * e.g. two-sided stenciling makes them differ), so the function is
    * picked by the calculated facing.  Only the first four output
    * vectors are passed on.
    */
   spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
                    fragZ,
                    outputs[0*4+0],
                    outputs[0*4+1],
                    outputs[0*4+2],
                    outputs[0*4+3],
                    mask);
}


/**
 * Map an X or Y coordinate to the coordinate of the block/quad containing
 * it, i.e. the same value with its lowest bit cleared.
 *
 * \param x  pixel coordinate (X or Y)
 * \return   \p x with its lowest bit cleared
 */
static INLINE int
block(int x)
{
   const int low_bit = 1;

   return x & ~low_bit;
}


/**
 * Render a horizontal span of quads.
 *
 * The pending span is taken from setup.span:
 *   - span.quad holds { left(even line), left(odd line),
 *                       right(even line), right(odd line) }
 *   - span.y_flags has bit 0x1 set if the even line (quad top row) was
 *     written and bit 0x2 set if the odd line (quad bottom row) was written.
 *
 * Every 2x2 quad overlapping the span is passed to emit_quad() together
 * with a mask of the pixels that lie inside [left, right) on their line.
 * Afterwards span.y, span.y_flags and the right-edge elements of span.quad
 * are reset to zero.  If no line was written the function returns at once
 * and resets nothing.
 */
static void
flush_spans(void)
{
   const int left_even  = spu_extract(setup.span.quad, 0);
   const int left_odd   = spu_extract(setup.span.quad, 1);
   const int right_even = spu_extract(setup.span.quad, 2);
   const int right_odd  = spu_extract(setup.span.quad, 3);
   int span_left, span_right;

   /* Horizontal extent covered by whichever lines were written. */
   switch (setup.span.y_flags) {
   case 0x1:
      /* even line only (quad top row) */
      span_left = left_even;
      span_right = right_even;
      break;

   case 0x2:
      /* odd line only (quad bottom row) */
      span_left = left_odd;
      span_right = right_odd;
      break;

   case 0x3:
      /* both quad rows: take the union of the two lines */
      span_left = MIN2(left_even, left_odd);
      span_right = MAX2(right_even, right_odd);
      break;

   default:
      /* nothing was written */
      return;
   }

   /* The tile data is very likely needed from here on: finish any
    * outstanding fetch, or perform a deferred clear.
    */
   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
      /* block until the color tile's mfc_get() has completed */
      wait_on_mask(1 << TAG_READ_TILE_COLOR);
      spu.cur_ctile_status = TILE_STATUS_CLEAN;
   }
   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
      clear_c_tile(&spu.ctile);
      spu.cur_ctile_status = TILE_STATUS_DIRTY;
   }
   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);

   /* Same treatment for the Z tile, but only if depth/stencil is read. */
   if (spu.read_depth_stencil) {
      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
         /* block until the Z tile's mfc_get() has completed */
         wait_on_mask(1 << TAG_READ_TILE_Z);
         spu.cur_ztile_status = TILE_STATUS_CLEAN;
      }
      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
         clear_z_tile(&spu.ztile);
         spu.cur_ztile_status = TILE_STATUS_DIRTY;
      }
      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
   }

   /* XXX the quad loop could be moved into the switch cases above... */

   /* Rearrange the span edges so that each of the four pixels of a quad
    * {top-left, top-right, bottom-left, bottom-right} can be compared with
    * the edges of its own line in a single vector operation:
    *   edges_LLll = { l0, l0, l1, l1 },  edges_RRrr = { r0, r0, r1, r1 }
    */
   const vec_int4 edges_LlRr = setup.span.quad;
   const vec_int4 edges_RrLl = spu_rlqwbyte(edges_LlRr, 8);
   const vec_int4 edges_LLll = spu_shuffle(edges_LlRr, edges_LlRr, SHUFFLE4(A,A,B,B));
   const vec_int4 edges_RRrr = spu_shuffle(edges_RrLl, edges_RrLl, SHUFFLE4(A,A,B,B));

   const vec_int4 quad_step = spu_splats(2);

   const int first_block = block(span_left);
   const int last_block = block(span_right);

   /* X coordinate of each pixel in the current quad. */
   vec_int4 pixel_x = {first_block, first_block+1, first_block, first_block+1};

   while (spu_extract(pixel_x, 0) <= last_block) {
      /* A pixel is inside the span when  left <= x  and  x < right.
       * "left <= x" is formed as NOT(left > x); nand(a, a) yields NOT a.
       */
      const mask_t left_gt_x = spu_cmpgt(edges_LLll, pixel_x);
      const mask_t x_ge_left = spu_nand(left_gt_x, left_gt_x);
      const mask_t right_gt_x = spu_cmpgt(edges_RRrr, pixel_x);
      const mask_t inside = spu_and(x_ge_left, right_gt_x);

      emit_quad(spu_extract(pixel_x, 0), setup.span.y, inside);

      pixel_x += quad_step;
   }

   /* Reset the span; only the right-edge elements of the quad are zeroed. */
   setup.span.y = 0;
   setup.span.y_flags = 0;
   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
}


#if DEBUG_VERTS
/**
 * Debug helper: write a vertex's address followed by the four components
 * of each of its attributes to stderr, one attribute per line.
 *
 * The number of attributes printed is spu.vertex_info.num_attribs.
 *
 * \param v  vertex to dump
 */
static void
print_vertex(const struct vertex_header *v)
{
   uint i;

   /* "%p" is only defined for pointers to void, hence the cast. */
   fprintf(stderr, "  Vertex: (%p)\n", (const void *) v);

   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      /* The index is unsigned, so it is printed with "%u" (the cast keeps
       * the argument type matched to the conversion whatever the exact
       * definition of uint is).
       */
      fprintf(stderr, "    %u: %f %f %f %f\n", (unsigned) i,
              spu_extract(v->data[i], 0),
              spu_extract(v->data[i], 1),
              spu_extract(v->data[i], 2),
              spu_extract(v->data[i], 3));
   }
}
예제 #29
0
// Flag the gridpoints on the outline of a world-space rectangle as culled
// by setting bit 0x80 in their entry of g_Outcodes.
//
// clip_min / clip_max are the rectangle's corners in world coordinates;
// component 0 maps to grid columns and component 2 to grid rows.  An edge
// that falls outside the nc x nr grid is skipped, and every edge is trimmed
// to the grid so that no store lands outside it.
void ClipToRectangle(vf32 clip_min, vf32 clip_max)
{
  // world coords -> integer grid coords (min rounded down, max rounded up)
  const vf32 world_to_grid = spu_splats(inv_step);
  const vi32 grid_lo = VecFloor4(spu_mul(clip_min - origin_world, world_to_grid));
  const vi32 grid_hi = VecCeil4 (spu_mul(clip_max - origin_world, world_to_grid));

  // grow the rectangle by 1 gridpoint on each side, because quads incident
  // to the verts we're about to cull out will also be culled out
  const i32 col_lo = spu_extract(grid_lo, 0) - 1;
  const i32 col_hi = spu_extract(grid_hi, 0) + 1;
  const i32 row_lo = spu_extract(grid_lo, 2) - 1;
  const i32 row_hi = spu_extract(grid_hi, 2) + 1;

  // half-open loop ranges, trimmed to the grid so we don't splat memory
  const i32 col_begin = col_lo < 0   ? 0  : col_lo;
  const i32 col_end   = col_hi >= nc ? nc : col_hi + 1;
  const i32 row_begin = row_lo < 0   ? 0  : row_lo;
  const i32 row_end   = row_hi >= nr ? nr : row_hi + 1;

  const u8 cull_bit = 0x80;

  // left edge: column col_lo, if it lies inside the grid
  if (col_lo >= 0 && col_lo < nc)
  {
    for (i32 row = row_begin; row < row_end; row++)
    {
      g_Outcodes[row*nc + col_lo] |= cull_bit;
    }
  }

  // right edge: column col_hi, if it lies inside the grid
  if (col_hi >= 0 && col_hi < nc)
  {
    for (i32 row = row_begin; row < row_end; row++)
    {
      g_Outcodes[row*nc + col_hi] |= cull_bit;
    }
  }

  // upper edge: row row_lo, if it lies inside the grid
  if (row_lo >= 0 && row_lo < nr)
  {
    for (i32 col = col_begin; col < col_end; col++)
    {
      g_Outcodes[row_lo*nc + col] |= cull_bit;
    }
  }

  // lower edge: row row_hi, if it lies inside the grid
  if (row_hi >= 0 && row_hi < nr)
  {
    for (i32 col = col_begin; col < col_end; col++)
    {
      g_Outcodes[row_hi*nc + col] |= cull_bit;
    }
  }
}
예제 #30
0
// Initialise the base-lod (lod 0) grid state from g_R2OCon and g_WaterObject:
//   - step / inv_step and the clip window (clip_min, clip_max)
//   - origin_world and the grid size nc x nr (padded, aligned and clamped),
//     plus the unpadded size true_nc x true_nr
//   - the "even" and "odd" step sizes and basis vectors, and the current
//     basis / per-column and per-row world deltas
//   - entry 0 of g_RenderData.m_origins and g_RenderData.m_cols_rows
// and finally calls SetBasisEtc(0,0).
// Throughout, vector component 0 is paired with columns and component 2
// with rows; components 1 and 3 are left at zero.
void InitBasisEtc()
{
  // Use a fixed initial step size for now; 128m for lod 0.
  // This yields an fft tile size of 32 x 128m = 4096m for lod 0
  // and maximum dimensions of 8192m x 8192m
  step = g_R2OCon.m_step;
  vf32 step_vec = (vf32){step, 0, step, 0};
  
  // get inverse-step using float magic (since taking the reciprocal of a power of 2 yields a 1-bit error)
  // si_ilhu(0x7F00) puts 0x7F00 in the upper halfword, i.e. the word 0x7F000000,
  // and si_sf(a, b) computes b - a, so this is 0x7F000000 minus the bit pattern
  // of step. For a power-of-2 step that negates the exponent and yields exactly
  // 1/step. NOTE(review): assumes step is always a power of 2 -- confirm.
  qword q_step  = si_from_float(step);
  qword q_magic = si_ilhu(0x7F00);
  inv_step = si_to_float(si_sf(q_step, q_magic));

  // set clip window
  clip_min = g_WaterObject.m_origin;
  clip_max = g_WaterObject.m_origin + g_WaterObject.m_dimensions;

  // set origin at gridpoint below clip min
  // (8388608 = 2^23: after adding 1.5 * 2^23 * step a float has no bits left
  // below 'step', so add-then-subtract snaps components 0 and 2 to a multiple
  // of step. NOTE(review): which way it snaps depends on the FP rounding
  // mode -- confirm it really is "below".)
  f32 magic_float = 1.5f * 8388608.0f * step;
  vf32 magic_vf32 = (vf32){magic_float, 0, magic_float, 0};
  origin_world = (clip_min + magic_vf32) - magic_vf32;

  // compute gridpoint above clip max
  // (same snapping trick, then one extra step)
  vf32 max_corner = (clip_max + magic_vf32) - magic_vf32;
  max_corner += step_vec;

  // offset both corners by the necessary amount of padding
  // (8 steps on each side, i.e. 16 extra gridpoints per axis)
  origin_world -= step_vec * spu_splats(8.0f);
  max_corner   += step_vec * spu_splats(8.0f);

  // set num cols & num rows
  // (number of steps across the padded extent, plus 1 for the end gridpoint)
  vf32 dims = max_corner - origin_world;
  nc = (i32)(spu_extract(dims,0) * inv_step) + 1;
  nr = (i32)(spu_extract(dims,2) * inv_step) + 1;

  // record true nc, nr
  // (size without the 2 x 8 gridpoints of padding added above)
  true_nc = nc - 16;
  true_nr = nr - 16;

  // alignment requirements (ooh, that's a bit strict)
  // (round up to the next multiple of 8)
  nc = (nc + 7) & -8;
  nr = (nr + 7) & -8;

  // deal with large grids
  // (cap each axis at 80 gridpoints = 64 true + 16 padding, and shrink the
  // matching component of dims to the capped extent)
  if (nc > 80)
  {
    nc = 80;
    true_nc = 64;
    dims = spu_insert((nc-1)*step, dims, 0);
  }
  if (nr > 80)
  {
    nr = 80;
    true_nr = 64;
    dims = spu_insert((nr-1)*step, dims, 2);
  }
  // NOTE(review): max_corner is recomputed here but not read again in this
  // function -- confirm whether it is still needed.
  max_corner = origin_world + dims;


  // "even" basis: axis-aligned, one step along component 0 (columns) and
  // component 2 (rows)
  even_step = step;
  even_inv_step = inv_step;
  even_basis_col = (vf32){1.0f, 0.0f, 0.0f, 0.0f};
  even_basis_row = (vf32){0.0f, 0.0f, 1.0f, 0.0f};

  // "odd" basis: r ~= 1/sqrt(2), so the odd step is step/sqrt(2), its inverse
  // is inv_step * r * 2 = inv_step * sqrt(2), and the basis vectors are the
  // even ones rotated by 45 degrees in the plane of components 0 and 2
  const f32 r = 0.707106781187f;
  odd_step    = even_step * r;
  odd_inv_step= even_inv_step * r * 2.0f;
  odd_basis_col = (vf32){ r, 0.0f, r, 0.0f};
  odd_basis_row = (vf32){-r, 0.0f, r, 0.0f};

  // start out on the even basis; dvc_world / dvr_world are the world-space
  // offsets of one column step and one row step
  basis_col = even_basis_col;
  basis_row = even_basis_row;
  dvc_world = spu_splats(step) * basis_col;
  dvr_world = spu_splats(step) * basis_row;

  // set base lod origin
  // (cols and rows are packed into one value: nc in bits 8 and up, nr in the
  // low 8 bits; both are at most 80 after the clamp above)
  g_RenderData.m_origins[0]   = origin_world;
  g_RenderData.m_cols_rows[0] = nc<<8 | nr;
  c0_amb = 0;
  r0_amb = 0;

  SetBasisEtc(0,0);
}