/*
 * Helper for bitDiff_d2: ceil(log2(|refi - valsi|)) for one lane.
 *
 * The subtraction is done in unsigned arithmetic so it cannot overflow, and
 * a zero difference is reported as 0 instead of converting log2(0) == -inf
 * to an integer (which is undefined behaviour).
 */
static unsigned long long bitDiff_d2_lane(long long refi, long long valsi)
{
    unsigned long long delta = (unsigned long long)refi - (unsigned long long)valsi;

    /* Top bit set means the signed difference was negative: negate it. */
    if (delta >> 63) {
        delta = 0ULL - delta;
    }
    if (delta == 0ULL) {
        return 0ULL;
    }
    return (unsigned long long)ceil(log2((double)delta));
}

/*
 * bitDiff_d2 - per-lane measure of how far two double vectors are apart,
 * expressed as a number of bits.
 *
 * Each lane of `ref` and `vals` is converted to a 64-bit integer with
 * make_ulonglong(); the lane result is ceil(log2()) of the absolute
 * difference of those integers.  Lanes whose integers are equal yield 0.
 *
 * ref   reference values
 * vals  values being compared against the reference
 *
 * Returns a vec_ullong2 with the result for lane 0 and lane 1.
 */
vec_ullong2 bitDiff_d2(vec_double2 ref, vec_double2 vals)
{
    double ref0, ref1, vals0, vals1;
    long long refi0, refi1, valsi0, valsi1;
    vec_ullong2 bits;

    ref0 = spu_extract(ref, 0);
    ref1 = spu_extract(ref, 1);
    vals0 = spu_extract(vals, 0);
    vals1 = spu_extract(vals, 1);

    refi0 = make_ulonglong(ref0);
    refi1 = make_ulonglong(ref1);
    valsi0 = make_ulonglong(vals0);
    valsi1 = make_ulonglong(vals1);

    bits = spu_promote(bitDiff_d2_lane(refi0, valsi0), 0);
    bits = spu_insert(bitDiff_d2_lane(refi1, valsi1), bits, 1);
    return bits;
}
/*
 * ulpDiff_d2 - per-lane absolute difference of two double vectors, measured
 * on the 64-bit integers produced by make_ulonglong().
 *
 * ref   reference values
 * vals  values being compared against the reference
 *
 * Returns a vec_ullong2 holding the difference for lane 0 and lane 1.
 */
vec_ullong2 ulpDiff_d2(vec_double2 ref, vec_double2 vals)
{
    const double refLane0 = spu_extract(ref, 0);
    const double refLane1 = spu_extract(ref, 1);
    const double valLane0 = spu_extract(vals, 0);
    const double valLane1 = spu_extract(vals, 1);

    const long long refBits0 = make_ulonglong(refLane0);
    const long long refBits1 = make_ulonglong(refLane1);
    const long long valBits0 = make_ulonglong(valLane0);
    const long long valBits1 = make_ulonglong(valLane1);

    long long delta0 = refBits0 - valBits0;
    long long delta1 = refBits1 - valBits1;

    /* Take the magnitude by subtracting the other way round. */
    if (delta0 < 0) {
        delta0 = valBits0 - refBits0;
    }
    if (delta1 < 0) {
        delta1 = valBits1 - refBits1;
    }

    vec_ullong2 result = spu_promote((unsigned long long)delta0, 0);
    result = spu_insert((unsigned long long)delta1, result, 1);
    return result;
}
int cacheGetPrime(int n) { if ((n < primeCacheStart + primeCacheSize) && (n > primeCacheStart)) { int r = spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4); return r; } // Haal op. uint32_t tag, size; tag = mfc_tag_reserve(); size = CACHE_PRIME_SIZE*16; unsigned long long EA = setup.vPrimes + (n - n%4) * 4; mfc_get(&primeCacheData, EA, size, tag, 0, 0); mfc_write_tag_mask(1 << tag); mfc_read_tag_status_all(); mfc_tag_release(tag); primeCacheStart = n - (n % 4); int r = spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4); return r; }
int main() { int i, j; /* The input and output arrays */ vector float uniform_vec[4]; vector float normal_vec[4]; /* Generate a seed */ struct timeval time; gettimeofday(&time, NULL); /* Generate the random numbers */ mc_rand_mt_init(time.tv_sec); mc_rand_mt_0_to_1_array_f4(4, uniform_vec); /* Transform the array */ mc_transform_po_array_f4(4, uniform_vec, normal_vec, &mc_rand_mt_0_to_1_f4); /* Display the results */ printf("Uniform Distribution: \n"); for(i=0; i<4; i++) for(j=0; j<4; j++) printf("%f ",spu_extract(uniform_vec[i], j)); printf("\n\nNormal Distribution: \n"); for(i=0; i<4; i++) for(j=0; j<4; j++) printf("%f ",spu_extract(normal_vec[i], j)); printf("\n"); return 0; }
/* Calculates the length of the string s, not including the terminating * \0 character. */ size_t strlen(const char *s) { size_t len; unsigned int cnt, cmp, skip, mask; vec_uchar16 *ptr, data; /* Compensate for initial mis-aligned string. */ ptr = (vec_uchar16 *)s; skip = (unsigned int)(ptr) & 15; mask = 0xFFFF >> skip; data = *ptr++; cmp = spu_extract(spu_gather(spu_cmpeq(data, 0)), 0); cmp &= mask; cnt = spu_extract(spu_cntlz(spu_promote(cmp, 0)), 0); len = cnt - (skip + 16); while (cnt == 32) { data = *ptr++; len -= 16; cnt = spu_extract(spu_cntlz(spu_gather(spu_cmpeq(data, 0))), 0); len += cnt; } return (len); }
/*
 * allequal_bits_float4 - returns 1 when every lane of bitDiff_f4(x, y),
 * read as a signed int, is <= tolerance; returns 0 otherwise.
 */
int allequal_bits_float4( vec_float4 x, vec_float4 y, int tolerance )
{
    const vec_uint4 laneBits = bitDiff_f4( x, y );

    if ( (int)spu_extract(laneBits, 0) > tolerance ) {
        return 0;
    }
    if ( (int)spu_extract(laneBits, 1) > tolerance ) {
        return 0;
    }
    if ( (int)spu_extract(laneBits, 2) > tolerance ) {
        return 0;
    }
    if ( (int)spu_extract(laneBits, 3) > tolerance ) {
        return 0;
    }
    return 1;
}
/*
 * allequal_ulps_float4 - returns 1 when every lane of ulpDiff_f4(x, y),
 * read as a signed int, is <= tolerance; returns 0 otherwise.
 */
int allequal_ulps_float4( vec_float4 x, vec_float4 y, int tolerance )
{
    const vec_uint4 laneUlps = ulpDiff_f4( x, y );

    if ( (int)spu_extract(laneUlps, 0) > tolerance ) {
        return 0;
    }
    if ( (int)spu_extract(laneUlps, 1) > tolerance ) {
        return 0;
    }
    if ( (int)spu_extract(laneUlps, 2) > tolerance ) {
        return 0;
    }
    if ( (int)spu_extract(laneUlps, 3) > tolerance ) {
        return 0;
    }
    return 1;
}
/*
 * merge_cache_blocks - repeatedly look for a chunk in the cache line that
 * can be merged with the chunk that follows it in the chunkNext chain, and
 * merge them.  Returns once no mergeable pair is found.
 *
 * Each pass unlinks the successor from the chain, marks it
 * CHUNKNEXT_FREE_BLOCK, and stores the updated chain back into
 * cache->chunkNext.
 */
inline void merge_cache_blocks(RenderableCacheLine* cache)
{
	const vec_uchar16 maskSplat = spu_splats((unsigned char)CHUNKNEXT_MASK);
	const vec_uchar16 oddSplat  = spu_splats((unsigned char) 1);
	const vec_uchar16 evenSplat = spu_splats((unsigned char) 254);

	vec_uchar16 chain = cache->chunkNext;

	while (1) {
		vec_uchar16 chain2    = spu_shuffle(chain, chain, chain);
		vec_uchar16 chainIdx  = spu_and(chain, maskSplat);

		vec_ushort8 startsAt0_lo = spu_cmpeq( cache->chunkStart[0], 0);
		vec_ushort8 startsAt0_hi = spu_cmpeq( cache->chunkStart[1], 0);

		// turn the next-index into a halfword offset; the low bit shifted in
		// does not matter
		vec_uchar16 succShuf = (vec_uchar16) spu_sl( (vec_ushort8)chainIdx, 1 );
		vec_uchar16 succIsFirst = (vec_uchar16) spu_shuffle( startsAt0_lo, startsAt0_hi, succShuf );

		vec_ushort8 triLo = cache->chunkTriangle[0];
		vec_ushort8 triHi = cache->chunkTriangle[1];

		// byte selectors for the high and low byte of the successor's triangle
		vec_uchar16 selHi = spu_or ( succShuf, oddSplat );
		vec_uchar16 selLo = spu_and( succShuf, evenSplat );

		vec_ushort8 succTriLo = spu_shuffle( triLo, triHi, spu_shuffle( selLo, selHi, SHUF0 ) );
		vec_ushort8 succTriHi = spu_shuffle( triLo, triHi, spu_shuffle( selLo, selHi, SHUF1 ) );

		vec_ushort8 sameTriLo = spu_cmpeq( triLo, succTriLo );
		vec_ushort8 sameTriHi = spu_cmpeq( triHi, succTriHi );
		vec_uchar16 sameTri   = (vec_uchar16) spu_shuffle( sameTriLo, sameTriHi, MERGE );

		vec_uchar16 blocked  = spu_orc(succIsFirst, sameTri);
		vec_uchar16 mergeable = spu_cmpgt( spu_nor(spu_or(chain, chain2), blocked), 256-CHUNKNEXT_BUSY_BIT );

		vec_uint4 mergeBits = spu_gather( mergeable );
		vec_uint4 firstIdx  = spu_sub( spu_cntlz( mergeBits ), spu_promote((unsigned int)16, 0));

		if ( !spu_extract(mergeBits, 0) ) {
			return;
		}

		// firstchunk = firstIdx[0]; nextchunk = chunkNextArray[firstchunk]
		vec_uint4 succIdx     = (vec_uint4) si_rotqby( (qword) chain, (qword) spu_add(firstIdx, 13) );
		vec_uint4 succSuccIdx = (vec_uint4) si_rotqby( (qword) chain, (qword) spu_add(succIdx, 13) );

		// chunkNextArray[firstchunk] = chunkNextArray[nextchunk]
		chain = spu_shuffle( (vec_uchar16) succSuccIdx, chain, (vec_uchar16) si_cbd( (qword) firstIdx, 0 ) );

		// chunkNextArray[nextchunk] = CHUNKNEXT_FREE_BLOCK
		chain = spu_shuffle( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK), chain, (vec_uchar16) si_cbd( (qword) succIdx, 0 ) );

		// debug aid only: chunkStartArray[nextchunk] = -1
		cache->chunkStartArray[ spu_extract(succIdx, 0) & 255 ] = -1;

		cache->chunkNext = chain;
	}
}
void GenerateFrustum() { // g_R2OCon.m_frustum_planes has 6 planes in the following order: // // 0: near (there is igg code that assumes this is first) // 1: left // 2: right // 3: bottom // 4: top // 5: far (there is igg code that assumes this is last) // the plane normals point into the interior of the frustum // copy T,B,L,R planes directly for (u32 i=1; i<5; i++) { g_Planes[i] = g_pViewData->m_frustum_planes[i]; // loosen frustum based on lod, amplitude, and a fudgefactor f32 nw = spu_extract(g_Planes[i], 3); f32 ny = spu_extract(g_Planes[i], 1); //f32 slop = g_R2OCon.m_frustum_fudge_factor * g_WaterObject.m_amplitude * powf(0.5f, 0.5f*(f32)lod) // * (g_R2OCon.m_step * 0.0625f); // readjust based on changed in lod 0 from 16m to 128m stepsize //f32 slop = g_R2OCon.m_frustum_fudge_factor * powf(0.5f, 0.5f*(f32)lod); f32 slop = g_R2OCon.m_frustum_fudge_factor * step; nw += slop * fabsf(ny); g_Planes[i] = spu_insert(nw, g_Planes[i], 3); } // compute near subfrustum plane based on lod //f32 d = g_R2OCon.m_near0 * powf(0.5f, 0.5f*(f32)lod); f32 d = g_R2OCon.m_near0 * step * (1.0f/128.0f); // test if (lod==g_R2OCon.m_num_lods-1) { d = step; } vf32 camera_direction = (vf32){0,0,1,0}; camera_direction = spu_insert(g_pViewData->m_world_to_camera_matrix.m_v0.z, camera_direction, 0); camera_direction = spu_insert(g_pViewData->m_world_to_camera_matrix.m_v1.z, camera_direction, 1); camera_direction = spu_insert(g_pViewData->m_world_to_camera_matrix.m_v2.z, camera_direction, 2); f32 dot = spu_extract(spu_dot3(camera_direction, g_pViewData->m_camera_position), 0); f32 w = -(dot + d); g_Planes[0] = spu_insert(w, camera_direction, 3); // reverse the sense of the near plane g_Planes[0] = -g_Planes[0]; }
/*
 * allequal_llroundf4 - returns 1 when all four 64-bit elements of x
 * (two lanes in each of vll[0] and vll[1]) equal those of y, else 0.
 */
int allequal_llroundf4( llroundf4_t x, llroundf4_t y )
{
    if ( spu_extract(x.vll[0], 0) != spu_extract(y.vll[0], 0) ) {
        return 0;
    }
    if ( spu_extract(x.vll[0], 1) != spu_extract(y.vll[0], 1) ) {
        return 0;
    }
    if ( spu_extract(x.vll[1], 0) != spu_extract(y.vll[1], 0) ) {
        return 0;
    }
    if ( spu_extract(x.vll[1], 1) != spu_extract(y.vll[1], 1) ) {
        return 0;
    }
    return 1;
}
/*
 * EulerLagrangeLifchitzPrimalityTest - test the 3-quadword number N.
 *
 * Computes r with _mpm_mod_exp_2() from the exponent (N-1)/2 and modulus N,
 * then compares r with either 1 or N-1 depending on N mod 8 and on
 * fSophieGermain:
 *
 *   fSophieGermain != 0:  N%8 == 7 -> r must equal 1
 *                         N%8 == 3 -> r must equal N-1
 *   fSophieGermain == 0:  N%8 == 1 -> r must equal 1
 *                         N%8 == 5 -> r must equal N-1
 *
 * Returns 1 when the applicable comparison succeeds, 0 in every other case
 * (including residues not listed above).
 */
int EulerLagrangeLifchitzPrimalityTest(vec_uint4 N[3], int fSophieGermain)
{
    const vec_uint4 one[3] = { { 0,0,0,0 }, { 0,0,0,0 }, { 0,0,0,1 } };
    vec_uint4 r[3];
    vec_uint4 N_1[3];     /* N - 1 */
    vec_uint4 N_1_2[3];   /* (N - 1) >> 1 */

    _mpm_sub(N_1, N, one, 3);
    MPM_SHR_BITS_LARGE(N_1_2, N_1, 3, zero, 1);
    _mpm_mod_exp_2(r, N_1_2, 3, N, 3, 6);

    /* The low three bits live in the last word of the last quadword. */
    const int nMod8 = spu_extract(N[2], 3) & 7;

    const int expectOne      = fSophieGermain ? (nMod8 == 7) : (nMod8 == 1);
    const int expectMinusOne = fSophieGermain ? (nMod8 == 3) : (nMod8 == 5);

    if (expectOne) {
        if (_mpm_cmpeq(r, one, 3)) {
            return 1;
        }
    } else if (expectMinusOne) {
        if (_mpm_cmpeq(r, N_1, 3)) {
            return 1;
        }
    }

    return 0;
}
/*
 * writeTriangleBuffer - flush the triangles written so far.
 *
 * Does nothing when endTriangle equals _currentTriangle.  Otherwise it
 * stores the next-triangle offset into the current triangle, DMAs the
 * triangle buffer out (length rounded up to a multiple of 128 bytes),
 * writes the new end offset into the cache line with a barriered put,
 * clears _currentTriangle and waits for tag 0 to complete.
 */
void writeTriangleBuffer(Triangle* endTriangle)
{
	if (endTriangle == _currentTriangle) {
		return;
	}

	int dmaBytes = ( ((char*)endTriangle) - _currentTriangleBuffer + 127) & ~127;
	unsigned short endOffset = (((char*)endTriangle) - ((char*)_currentTriangle)) + _currentTriangleOffset;
	vec_ushort8 endOffsetVec = spu_promote(endOffset, 1);

	// genuine next pointer ( rewind==0 -> next, rewind!=0 -> 0 )
	unsigned short nextOffset = spu_extract( spu_andc( endOffsetVec, _currentTriangleRewind ), 1 );
	_currentTriangle->next_triangle = nextOffset;

	// DMA the triangle data out
	spu_mfcdma64(_currentTriangleBuffer,
		     mfc_ea2h(_currentTriangleBufferEA),
		     mfc_ea2l(_currentTriangleBufferEA),
		     dmaBytes, 0, MFC_PUT_CMD);

	// update the information in the cache line; _currentTriangleRewind is
	// re-used as the DMA source because its old value is no longer needed
	_currentTriangleRewind = spu_splats(nextOffset);
	char* src = ((char*)&_currentTriangleRewind) + (_currentTriangleCacheEndTriangleEAL & 15);
	spu_mfcdma64(src,
		     _currentTriangleCacheEndTriangleEAH,
		     _currentTriangleCacheEndTriangleEAL,
		     sizeof(short), 0, MFC_PUTB_CMD);

	// finally invalidate the triangle info
	_currentTriangle = NULL;

	// and make sure the DMA completed
	mfc_write_tag_mask(1<<0);
	mfc_read_tag_status_all();
}
unsigned int __mfc_tag_release (unsigned int tag) { vector unsigned int is_invalid; vector unsigned int mask = (vector unsigned int) { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; vector signed int zero = (vector signed int) { 0, 0, 0, 0 }; vector signed int has_been_reserved; /* Check if the tag is out of range. */ is_invalid = spu_cmpgt (spu_promote (tag, 0), 31); /* Check whether the tag has been reserved, set to all 1 if has not been reserved, 0 otherwise. */ has_been_reserved = (vector signed int) spu_rl (__mfc_tag_table, tag); has_been_reserved = (vector signed int) spu_cmpgt (zero, has_been_reserved); /* Set invalid. */ is_invalid = spu_or ((vector unsigned int) has_been_reserved, is_invalid); mask = spu_rlmask (mask, (int)(-tag)); __mfc_tag_table = spu_or (__mfc_tag_table, mask); return spu_extract(is_invalid, 0); }
int num_in_buffer(int side){ volatile vector signed int *head_idx, *tail_idx; int buffer_size; if(side == OUT && mcb[am].local[OUT] < 255){ int parent_idx = mcb[am].local[OUT]; int side = (mcb[am].id+1)&1; head_idx = &md[parent_idx].idx[side][HEAD]; tail_idx = &md[parent_idx].idx[side][TAIL]; buffer_size = mcb[parent_idx].buffer_size[side]; } else { head_idx = &md[am].idx[side][HEAD]; tail_idx = &md[am].idx[side][TAIL]; buffer_size = mcb[am].buffer_size[side]; } vector signed int diff = spu_sub(*head_idx,*tail_idx); int num = spu_extract(diff,0); if(num < 0) num = num + buffer_size; return num; }
unsigned int __mfc_multi_tag_reserve (unsigned int number_of_tags) { vector unsigned int table_copy; vector unsigned int one = (vector unsigned int) { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; vector unsigned int count_busy, is_valid; vector unsigned int count_total; vector unsigned int count_avail = (vector unsigned int) { 0, 0, 0, 0 }; vector unsigned int index = (vector unsigned int) { 0, 0, 0, 0 }; table_copy = __mfc_tag_table; /* count_busy: number of consecutive busy tags count_avail: number of consecutive free tags table_copy: temporary copy of the tag table count_total: sum of count_busy and count_avail index: index of the current working tag */ do { table_copy = spu_sl (table_copy, count_avail); count_busy = spu_cntlz (table_copy); table_copy = spu_sl (table_copy, count_busy); count_avail = spu_cntlz (spu_xor(table_copy, -1)); count_total = spu_add (count_busy, count_avail); index = spu_add (index, count_total); } while (spu_extract (count_avail, 0) < number_of_tags && spu_extract (table_copy, 0) != 0); index = spu_sub (index, count_avail); /* is_valid is set to 0xFFFFFFFF if table_copy == 0, 0 otherwise. */ is_valid = spu_cmpeq (table_copy, 0); index = spu_sel (index, is_valid, is_valid); /* Now I need to actually mark the tags as used. */ table_copy = spu_sl (one, number_of_tags); table_copy = spu_rl (table_copy, -number_of_tags - spu_extract (index, 0)); table_copy = spu_sel (table_copy, __mfc_tag_table, table_copy); __mfc_tag_table = spu_sel (table_copy, __mfc_tag_table, is_valid); return spu_extract (index, 0); }
/*
 * kernel_zfft_f - run in-place split-complex 1-D FFTs over one work block.
 *
 * params    Fft_split_params describing size, direction, scale and chunking
 * inout     buffer holding `chunks` real rows followed by `chunks`
 *           imaginary rows, each `size` floats long
 * iter      index of this work block
 * iter_max  total number of work blocks
 *
 * The FFT object is (re)created on the first iteration whenever the size
 * differs from current_size.  Always returns 0.
 */
static int kernel_zfft_f(lwp_functions* pf, void* params, void* inout, unsigned int iter, unsigned int iter_max)
{
  static fft1d_f* obj;

  Fft_split_params* fftp = (Fft_split_params *)params;
  unsigned int size = fftp->size;
  unsigned int chunks = fftp->chunks_per_wb;
  unsigned int row;
  int dir;

  if (fftp->direction == fwd_fft)
    dir = CML_FFT_FWD;
  else
    dir = CML_FFT_INV;

  // The final work block may hold fewer chunks than the others.
  if (iter == iter_max-1 && iter_max * chunks > fftp->chunks_per_spe)
    chunks = fftp->chunks_per_spe % chunks;

  assert(size >= MIN_FFT_1D_SIZE);
  assert(size <= MAX_FFT_1D_SIZE);

  if (iter == 0 && size != current_size)
  {
#if !NDEBUG
    // Check that buffer space doesn't overlap with stack
    register volatile vector unsigned int get_r1 asm("1");
    unsigned int stack_pointer = spu_extract(get_r1, 0);
    assert(buf + 2*MAX_FFT_1D_SIZE + size + 128/4 < stack_pointer);
#endif
    int rt = cml_fft1d_setup_f(&obj, CML_FFT_CC, size, buf + 2*MAX_FFT_1D_SIZE);
    assert(rt && obj != NULL);
    current_size = size;
  }

  float* inout_re = (float*)inout + 0 * size;
  float* inout_im = (float*)inout + 1*chunks * size;

  float* row_re = inout_re;
  float* row_im = inout_im;
  for (row = 0; row < chunks; ++row)
  {
    cml_zzfft1d_op_f(obj, row_re, row_im, row_re, row_im, dir, buf);
    row_re += size;
    row_im += size;
  }

  if (fftp->scale != (double)1.f)
  {
    // Real and imaginary parts are contiguous, so one scalar-vector
    // multiply over 2*size*chunks floats scales both.
    cml_core_svmul1_f(fftp->scale, inout_re, inout_re, 2*size*chunks);
  }

  return 0;
}
void * sbrk (ptrdiff_t increment) { static caddr_t heap_ptr = NULL; caddr_t base; vector unsigned int sp_reg, sp_delta; vector unsigned int *sp_ptr; caddr_t sps; /* The stack pointer register. */ volatile register vector unsigned int sp_r1 __asm__("1"); if (heap_ptr == NULL) heap_ptr = (caddr_t) & _end; sps = (caddr_t) spu_extract (sp_r1, 0); if (((int) sps - STACKSIZE - (int) heap_ptr) >= increment) { base = heap_ptr; heap_ptr += increment; sp_delta = (vector unsigned int) spu_insert (increment, spu_splats (0), 1); /* Subtract sp_delta from the SP limit (word 1). */ sp_r1 = spu_sub (sp_r1, sp_delta); /* Fix-up backchain. */ sp_ptr = (vector unsigned int *) spu_extract (sp_r1, 0); do { sp_reg = *sp_ptr; *sp_ptr = (vector unsigned int) spu_sub (sp_reg, sp_delta); } while ((sp_ptr = (vector unsigned int *) spu_extract (sp_reg, 0))); return (base); } else { errno = ENOMEM; return ((void *) -1); } }
static btVector3 convexHullSupport (const btVector3& localDirOrg, const btVector3* points, int numPoints, const btVector3& localScaling) { btVector3 vec = localDirOrg * localScaling; #if defined (__CELLOS_LV2__) && defined (__SPU__) btVector3 localDir = vec; vec_float4 v_distMax = {-FLT_MAX,0,0,0}; vec_int4 v_idxMax = {-999,0,0,0}; int v=0; int numverts = numPoints; for(;v<(int)numverts-4;v+=4) { vec_float4 p0 = vec_dot3(points[v ].get128(),localDir.get128()); vec_float4 p1 = vec_dot3(points[v+1].get128(),localDir.get128()); vec_float4 p2 = vec_dot3(points[v+2].get128(),localDir.get128()); vec_float4 p3 = vec_dot3(points[v+3].get128(),localDir.get128()); const vec_int4 i0 = {v ,0,0,0}; const vec_int4 i1 = {v+1,0,0,0}; const vec_int4 i2 = {v+2,0,0,0}; const vec_int4 i3 = {v+3,0,0,0}; vec_uint4 retGt01 = spu_cmpgt(p0,p1); vec_float4 pmax01 = spu_sel(p1,p0,retGt01); vec_int4 imax01 = spu_sel(i1,i0,retGt01); vec_uint4 retGt23 = spu_cmpgt(p2,p3); vec_float4 pmax23 = spu_sel(p3,p2,retGt23); vec_int4 imax23 = spu_sel(i3,i2,retGt23); vec_uint4 retGt0123 = spu_cmpgt(pmax01,pmax23); vec_float4 pmax0123 = spu_sel(pmax23,pmax01,retGt0123); vec_int4 imax0123 = spu_sel(imax23,imax01,retGt0123); vec_uint4 retGtMax = spu_cmpgt(v_distMax,pmax0123); v_distMax = spu_sel(pmax0123,v_distMax,retGtMax); v_idxMax = spu_sel(imax0123,v_idxMax,retGtMax); } for(;v<(int)numverts;v++) { vec_float4 p = vec_dot3(points[v].get128(),localDir.get128()); const vec_int4 i = {v,0,0,0}; vec_uint4 retGtMax = spu_cmpgt(v_distMax,p); v_distMax = spu_sel(p,v_distMax,retGtMax); v_idxMax = spu_sel(i,v_idxMax,retGtMax); } int ptIndex = spu_extract(v_idxMax,0); const btVector3& supVec= points[ptIndex] * localScaling; return supVec; #else btScalar maxDot; long ptIndex = vec.maxDot( points, numPoints, maxDot); btAssert(ptIndex >= 0); btVector3 supVec = points[ptIndex] * localScaling; return supVec; #endif //__SPU__ }
int main(int argc, char **argv) { int i, j; vector unsigned int int_vec; vector float float_vec; vector double double_vec[4]; /* Get the current time to use as seed */ struct timeval time; gettimeofday(&time, NULL); /* The hardware number generator */ printf("\nHardware Generator:\n"); if(mc_rand_hw_init() == 0) { int_vec = mc_rand_hw_u4(); for(i=0; i<4; i++) printf("%u ",spu_extract(int_vec, i)); printf("\n\n"); } else printf("Hardware RNG is not available.\n\n"); /* The Kirkpatrick Stoll PRNG */ mc_rand_ks_init(time.tv_sec); float_vec = mc_rand_ks_0_to_1_f4(); printf("Kirkpatrick-Stoll:\n"); for(i=0; i<4; i++) printf("%f ", spu_extract(float_vec, i)); printf("\n\n"); /* The Mersenne Twister PRNG */ mc_rand_mt_init(time.tv_sec); mc_rand_mt_minus1_to_1_array_d2(4, double_vec); printf("Mersenne Twister:\n"); for(i=0; i<4; i++) { for(j=0; j<2; j++) printf("%g ",spu_extract(double_vec[i], j)); printf("\n"); } return 0; }
int main(int argc, char **argv) { int i; vector unsigned int all_ones = (vector unsigned int) {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}; vector unsigned int all_zeroes = (vector unsigned int) {0x00000000, 0x00000000, 0x00000000, 0x00000000}; /* These bits will form the selection mask */ unsigned short mask = 0x9; /* Each bit in 0x9 forms a word in the mask */ vector unsigned int resultw = spu_sel(all_zeroes, all_ones, spu_maskw(mask)); printf("resultw: "); for (i=0; i<4; i++) { printf("%08x", spu_extract(resultw, i)); } /* Each bit in 0x09 forms a halfword in the mask */ vector unsigned short resulth = spu_sel((vector unsigned short)all_zeroes, (vector unsigned short)all_ones, spu_maskh(mask)); printf("\nresulth: "); for (i=0; i<8; i++) { printf("%04x", spu_extract(resulth, i)); } /* Each bit in 0x0009 forms a byte in the mask */ vector unsigned char resultb = spu_sel((vector unsigned char)all_zeroes, (vector unsigned char)all_ones, spu_maskb(mask)); printf("\nresultb: "); for (i=0; i<16; i++) { printf("%02x", spu_extract(resultb, i)); } printf("\n"); return 0; }
void cp_buffer(int side){ int avail_out = num_free_in_buffer(OUT); int avail_side = num_in_buffer(side); int max = avail_out < avail_side ? avail_out : avail_side; vector signed int *out_head; if(mcb[am].local[OUT] < 255) out_head = (vector signed int*) &md[ mcb[am].local[OUT] ].idx[ (mcb[am].id+1)&1 ][HEAD]; else out_head = (vector signed int*) &md[am].idx[OUT][HEAD]; vector unsigned int cmp_v; vector signed int from_size = spu_splats( mcb[am].buffer_size[side] ); vector signed int out_size = spu_splats( mcb[ mcb[am].local[OUT] ].buffer_size[ (mcb[am].id+1)&1 ] ); vector signed int ones = {1,1,1,1}; vector signed int zeros = {0,0,0,0}; int i; for(i = 0; i < max; i++){ md[am].buffer[OUT][spu_extract( *out_head,0)] = md[am].buffer[side][spu_extract(md[am].idx[side][TAIL],0)]; // update idx md[am].idx[side][TAIL] = spu_add(md[am].idx[side][TAIL], ones); cmp_v = spu_cmpeq(md[am].idx[side][TAIL],from_size); md[am].idx[side][TAIL] = spu_sel(md[am].idx[side][TAIL], zeros, cmp_v); *out_head = spu_add(*out_head,ones); cmp_v = spu_cmpeq(*out_head, out_size); *out_head = spu_sel(*out_head,zeros,cmp_v); } update_tail(side); md[am].consumed[side] += max; if(mcb[am].local[OUT] < 255 && md[am].consumed[side] == mcb[am].data_size[side]){ md[am].depleted[side] = 1; md[am].done = 1; --num_active_mergers; } }
/*
 * R2O_CutHoles - mark hole cells in an outcode window.
 *
 * outcodes  width x height grid of outcode bytes, row-major
 * width     number of columns in the window
 * height    number of rows in the window
 * lod       level of detail whose holes should be applied
 *
 * Every hole in g_Holes whose tag matches the current water object and
 * whose lod matches is walked for m_cnt steps of 2*step along x or z,
 * depending on m_dir.  Each visited point that falls inside the window gets
 * bit 0x80 set in its outcode.  If the camera lies within one step of such
 * a point (in x and z), g_RenderData.m_b_camera_over_water is cleared.
 */
void R2O_CutHoles(u8 *outcodes, i32 width, i32 height, u32 lod)
{
	// window extents
	const f32 x_min = spu_extract(origin_world, 0);
	const f32 z_min = spu_extract(origin_world, 2);
	const f32 x_max = x_min + (f32)width  * step;
	const f32 z_max = z_min + (f32)height * step;

	// tag identifying the water object, built from the raw bits of its
	// origin x and z
	vu32 origin_bits = *(vu32 *)&g_WaterObject.m_origin;
	const u32 tag = spu_extract(origin_bits, 0) ^ spu_extract(origin_bits, 2);

	for (u32 h = 0; h < g_R2OCon.m_num_holes; h++)
	{
		R2OHole *hole = g_Holes + h;

		if (!(tag == hole->m_tag && lod == hole->m_lod))
		{
			continue;
		}

		// start of hole
		f32 x = hole->m_xcoord;
		f32 z = hole->m_zcoord;

		// per-step deltas: m_dir selects stepping along z, otherwise x
		f32 dx = hole->m_dir ? 0.0f : 2.0f*step;
		f32 dz = hole->m_dir ? 2.0f*step : 0.0f;

		for (u32 n = 0; n < hole->m_cnt; n++, x += dx, z += dz)
		{
			// skip points outside the current window
			if (!(x>=x_min && x<x_max && z>=z_min && z<z_max))
			{
				continue;
			}

			// translate coords to col/row values and punch out the hole
			i32 col = (i32)((x-x_min) * inv_step);
			i32 row = (i32)((z-z_min) * inv_step);
			outcodes[row*width + col] |= 0x80;

			// test camera against hole
			f32 x_lo = x - step;
			f32 x_hi = x + step;
			f32 z_lo = z - step;
			f32 z_hi = z + step;
			f32 cam_x = spu_extract(g_pViewData->m_camera_position, 0);
			f32 cam_z = spu_extract(g_pViewData->m_camera_position, 2);

			if (cam_x>=x_lo && cam_x<x_hi && cam_z>=z_lo && cam_z<z_hi)
			{
				g_RenderData.m_b_camera_over_water = false;
			}
		}
	}
}
int main(unsigned long long id) { vector unsigned int x = get_vector_param_3(); vector unsigned int count = (vector unsigned int){0,0,0,0}; vector unsigned int result = (vector unsigned int){0,0,0,0}; spu_ready(); count = popc(x); result = reduce_word(count); spu_write_out_mbox(spu_extract(result, 0)); return SPU_SUCCESS; }
/*
 * strncmp_ea - compare at most n3 bytes of two strings in __ea space.
 *
 * The strings are compared piecewise.  Each piece is limited to the
 * smallest of: the bytes still to compare, the bytes left before the next
 * 128-byte boundary of s1, and the same for s2, so that both pointers
 * returned by __cache_fetch() cover the piece.
 *
 * Returns the result of _strncmp_internal() for the piece where comparison
 * stopped, or 0 when n3 is 0.
 */
int
strncmp_ea (__ea void *s1, __ea const void *s2, size_ea_t n3)
{
  __ea void *pos1 = (__ea void *) s1;
  __ea void *pos2 = (__ea void *) s2;
  void *local1;
  void *local2;
  size_ea_t room1;
  size_ea_t room2;
  int piece;
  int result = 0;		/* in case n3 is 0 */
  vec_uint4 end_v;

  while (n3)
    {
      local2 = __cache_fetch (pos2);
      local1 = __cache_fetch (pos1);

      /*
       * Piece length: smallest of what is left to compare and what is left
       * in each string's current 128-byte line.
       */
      room2 = ROUND_UP_NEXT_128 ((size_ea_t) pos2) - (size_ea_t) pos2;
      room1 = ROUND_UP_NEXT_128 ((size_ea_t) pos1) - (size_ea_t) pos1;
      piece = three_way_min (room2, room1, n3);

      result = _strncmp_internal (local1, local2, piece, &end_v, 1);

      /*
       * Only the first slot of end_v is set.  Stop on a difference or when
       * end_v signals the end; result is returned either way because it may
       * be non-zero.
       */
      if (result || spu_extract (end_v, 0))
	return result;

      pos2 += piece;
      pos1 += piece;
      n3 -= piece;
    }

  return result;
}
unsigned int __mfc_tag_reserve (void) { vector unsigned int mask = (vector unsigned int) { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; vector unsigned int count_zeros, is_valid; vector signed int count_neg; count_zeros = spu_cntlz (__mfc_tag_table); count_neg = spu_sub (0, (vector signed int) count_zeros); mask = spu_rlmask (mask, (vector signed int) count_neg); __mfc_tag_table = spu_andc (__mfc_tag_table, mask); is_valid = spu_cmpeq (count_zeros, 32); count_zeros = spu_sel (count_zeros, is_valid, is_valid); return spu_extract (count_zeros, 0); }
void pull(int side){ int avail_in = num_free_in_buffer(side); int avail_mm = mcb[am].data_size[side] - md[am].num_pulled[side]; int num_pull = avail_in < avail_mm ? avail_in : avail_mm; num_pull = num_pull < MAX_DMA_SIZE ? num_pull : MAX_DMA_SIZE; int head = spu_extract(md[am].idx[side][HEAD],0); int avail_from_head = mcb[am].buffer_size[side] - head; int first_pull = num_pull < avail_from_head ? num_pull : avail_from_head; if(!first_pull) return; // pull #first_pull unsigned int to_ea = (unsigned int) &md[am].buffer[side][head]; int tag = mfc_tag_reserve(); if(tag == MFC_TAG_INVALID){ return; } else { md[am].held_tag[side] = tag; } mfc_get((void*)to_ea, mcb[am].block_addr[side], first_pull * sizeof(vector signed int), md[am].held_tag[side], 0,0); mcb[am].block_addr[side] += first_pull * sizeof(vector signed int); if(first_pull < num_pull){ to_ea = (unsigned int) &md[am].buffer[side][0]; int second_pull = num_pull - first_pull; mfc_get((void*)to_ea, mcb[am].block_addr[side], second_pull * sizeof(vector signed int), md[am].held_tag[side], 0,0); mcb[am].block_addr[side] += second_pull * sizeof(vector signed int); } md[am].num_waiting[side] = num_pull; }
unsigned int __mfc_multi_tag_release (unsigned int first_tag, unsigned int number_of_tags) { vector unsigned int table_copy, tmp, tmp1; vector unsigned int one = (vector unsigned int) { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; vector unsigned int is_invalid; unsigned int last_tag; vector unsigned int has_been_reserved; last_tag = first_tag + number_of_tags; table_copy = spu_sl (one, number_of_tags); table_copy = spu_rl (table_copy, -last_tag); table_copy = spu_xor (table_copy, -1); /* Make sure the tags are in range and valid. */ tmp = spu_cmpgt (spu_promote(last_tag, 0), 32); tmp1 = spu_cmpgt (spu_promote(number_of_tags, 0), 32); is_invalid = spu_cmpgt (spu_promote(first_tag, 0), 31); /* All bits are set to 1 if invalid, 0 if valid. */ is_invalid = spu_or (tmp, is_invalid); is_invalid = spu_or (tmp1, is_invalid); /* check whether these tags have been reserved */ tmp = spu_rlmask (one, (int)-number_of_tags); tmp1 = spu_sl (__mfc_tag_table, first_tag); has_been_reserved = spu_cmpgt(tmp1, tmp); is_invalid = spu_or (has_been_reserved, is_invalid); table_copy = spu_sel (__mfc_tag_table, table_copy, table_copy); __mfc_tag_table = spu_sel (table_copy, __mfc_tag_table, is_invalid); return spu_extract (is_invalid, 0); }
/** * Setup fragment shader inputs by evaluating triangle's vertex * attribute coefficient info. * \param x quad x pos * \param y quad y pos * \param fragZ returns quad Z values * \param fragInputs returns fragment program inputs * Note: this code could be incorporated into the fragment program * itself to avoid the loop and switch. */ static void eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[]) { static const vector float deltaX = (const vector float) {0, 1, 0, 1}; static const vector float deltaY = (const vector float) {0, 0, 1, 1}; const uint posSlot = 0; const vector float pos = setup.coef[posSlot].a0; const vector float dposdx = setup.coef[posSlot].dadx; const vector float dposdy = setup.coef[posSlot].dady; const vector float fragX = spu_splats(x) + deltaX; const vector float fragY = spu_splats(y) + deltaY; vector float fragW, wInv; uint i; *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy); fragW = splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy); wInv = spu_re(fragW); /* 1 / w */ /* loop over fragment program inputs */ for (i = 0; i < spu.vertex_info.num_attribs; i++) { uint attr = i + 1; enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode; /* constant term */ vector float a0 = setup.coef[attr].a0; vector float r0 = splatx(a0); vector float r1 = splaty(a0); vector float r2 = splatz(a0); vector float r3 = splatw(a0); if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) { /* linear term */ vector float dadx = setup.coef[attr].dadx; vector float dady = setup.coef[attr].dady; /* Use SPU intrinsics here to get slightly better code. 
* originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady); */ r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0)); r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1)); r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2)); r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3)); if (interp == INTERP_PERSPECTIVE) { /* perspective term */ r0 *= wInv; r1 *= wInv; r2 *= wInv; r3 *= wInv; } } fragInputs[CHAN0] = r0; fragInputs[CHAN1] = r1; fragInputs[CHAN2] = r2; fragInputs[CHAN3] = r3; fragInputs += 4; } } /** * Emit a quad (pass to next stage). No clipping is done. * Note: about 1/5 to 1/7 of the time, mask is zero and this function * should be skipped. But adding the test for that slows things down * overall. */ static INLINE void emit_quad( int x, int y, mask_t mask) { /* If any bits in mask are set... */ if (spu_extract(spu_orx(mask), 0)) { const int ix = x - setup.cliprect_minx; const int iy = y - setup.cliprect_miny; spu.cur_ctile_status = TILE_STATUS_DIRTY; spu.cur_ztile_status = TILE_STATUS_DIRTY; { /* * Run fragment shader, execute per-fragment ops, update fb/tile. */ vector float inputs[4*4], outputs[2*4]; vector unsigned int kill_mask; vector float fragZ; eval_inputs((float) x, (float) y, &fragZ, inputs); ASSERT(spu.fragment_program); ASSERT(spu.fragment_ops); /* Execute the current fragment program */ kill_mask = spu.fragment_program(inputs, outputs, spu.constants); mask = spu_andc(mask, kill_mask); /* Execute per-fragment/quad operations, including: * alpha test, z test, stencil test, blend and framebuffer writing. * Note that there are two different fragment operations functions * that can be called, one for front-facing fragments, and one * for back-facing fragments. (Often the two are the same; * but in some cases, like two-sided stenciling, they can be * very different.) So choose the correct function depending * on the calculated facing. 
*/ spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile, fragZ, outputs[0*4+0], outputs[0*4+1], outputs[0*4+2], outputs[0*4+3], mask); } } } /** * Given an X or Y coordinate, return the block/quad coordinate that it * belongs to. */ static INLINE int block(int x) { return x & ~1; } /** * Render a horizontal span of quads */ static void flush_spans(void) { int minleft, maxright; const int l0 = spu_extract(setup.span.quad, 0); const int l1 = spu_extract(setup.span.quad, 1); const int r0 = spu_extract(setup.span.quad, 2); const int r1 = spu_extract(setup.span.quad, 3); switch (setup.span.y_flags) { case 0x3: /* both odd and even lines written (both quad rows) */ minleft = MIN2(l0, l1); maxright = MAX2(r0, r1); break; case 0x1: /* only even line written (quad top row) */ minleft = l0; maxright = r0; break; case 0x2: /* only odd line written (quad bottom row) */ minleft = l1; maxright = r1; break; default: return; } /* OK, we're very likely to need the tile data now. * clear or finish waiting if needed. 
*/ if (spu.cur_ctile_status == TILE_STATUS_GETTING) { /* wait for mfc_get() to complete */ //printf("SPU: %u: waiting for ctile\n", spu.init.id); wait_on_mask(1 << TAG_READ_TILE_COLOR); spu.cur_ctile_status = TILE_STATUS_CLEAN; } else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) { //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty); clear_c_tile(&spu.ctile); spu.cur_ctile_status = TILE_STATUS_DIRTY; } ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED); if (spu.read_depth_stencil) { if (spu.cur_ztile_status == TILE_STATUS_GETTING) { /* wait for mfc_get() to complete */ //printf("SPU: %u: waiting for ztile\n", spu.init.id); wait_on_mask(1 << TAG_READ_TILE_Z); spu.cur_ztile_status = TILE_STATUS_CLEAN; } else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) { //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty); clear_z_tile(&spu.ztile); spu.cur_ztile_status = TILE_STATUS_DIRTY; } ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED); } /* XXX this loop could be moved into the above switch cases... */ /* Setup for mask calculation */ const vec_int4 quad_LlRr = setup.span.quad; const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8); const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B)); const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B)); const vec_int4 twos = spu_splats(2); const int x = block(minleft); vec_int4 xs = {x, x+1, x, x+1}; for (; spu_extract(xs, 0) <= block(maxright); xs += twos) { /** * Computes mask to indicate which pixels in the 2x2 quad are actually * inside the triangle's bounds. 
*/ /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */ const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs); const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs); /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */ const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs); /* Combine results to create mask */ const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs); emit_quad(spu_extract(xs, 0), setup.span.y, mask); } setup.span.y = 0; setup.span.y_flags = 0; /* Zero right elements */ setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0)); } #if DEBUG_VERTS static void print_vertex(const struct vertex_header *v) { uint i; fprintf(stderr, " Vertex: (%p)\n", v); for (i = 0; i < spu.vertex_info.num_attribs; i++) { fprintf(stderr, " %d: %f %f %f %f\n", i, spu_extract(v->data[i], 0), spu_extract(v->data[i], 1), spu_extract(v->data[i], 2), spu_extract(v->data[i], 3)); } }
// Flags the outline of a clip rectangle in g_Outcodes.
//
// clip_min / clip_max are world-space corners. They are converted to grid
// coordinates relative to origin_world (scaled by inv_step), rounded outward,
// and grown by one gridpoint on every side. Every g_Outcodes entry lying on
// one of the four edges of that grown rectangle, and inside the nr x nc grid,
// gets bit 0x80 OR-ed in (the original comments call this "culling" the point).
// Vector element 0 supplies the column coordinate, element 2 the row coordinate.
void ClipToRectangle(vf32 clip_min, vf32 clip_max)
{
  // world coords -> fractional grid coords -> integer gridpoints
  // (floor for the min corner, ceil for the max corner)
  const vf32 to_grid = spu_splats(inv_step);
  vi32 cell_lo = VecFloor4(spu_mul(clip_min - origin_world, to_grid));
  vi32 cell_hi = VecCeil4 (spu_mul(clip_max - origin_world, to_grid));

  // grow the rectangle by one gridpoint per side: quads touching the verts
  // flagged below are going to be culled as well
  const i32 left   = spu_extract(cell_lo, 0) - 1;
  const i32 right  = spu_extract(cell_hi, 0) + 1;
  const i32 top    = spu_extract(cell_lo, 2) - 1;
  const i32 bottom = spu_extract(cell_hi, 2) + 1;

  // half-open iteration ranges clamped to the grid, so no store can land
  // outside g_Outcodes
  i32 col_begin = 0;
  if (left >= 0)
  {
    col_begin = left;
  }
  i32 col_end = nc;
  if (right < nc)
  {
    col_end = right + 1;
  }
  i32 row_begin = 0;
  if (top >= 0)
  {
    row_begin = top;
  }
  i32 row_end = nr;
  if (bottom < nr)
  {
    row_end = bottom + 1;
  }

  // left edge: one column, walked down the clamped row range
  if (left >= 0 && left < nc)
  {
    for (i32 row = row_begin; row < row_end; row++)
    {
      g_Outcodes[row*nc + left] |= 0x80;
    }
  }

  // right edge
  if (right >= 0 && right < nc)
  {
    for (i32 row = row_begin; row < row_end; row++)
    {
      g_Outcodes[row*nc + right] |= 0x80;
    }
  }

  // upper edge: one row, walked across the clamped column range
  if (top >= 0 && top < nr)
  {
    for (i32 col = col_begin; col < col_end; col++)
    {
      g_Outcodes[top*nc + col] |= 0x80;
    }
  }

  // lower edge
  if (bottom >= 0 && bottom < nr)
  {
    for (i32 col = col_begin; col < col_end; col++)
    {
      g_Outcodes[bottom*nc + col] |= 0x80;
    }
  }
}
void InitBasisEtc() { // Use a fixed initial step size for now; 128m for lod 0. // This yields an fft tile size of 32 x 128m = 4096m for lod 0 // and maximum dimensions of 8192m x 8192m step = g_R2OCon.m_step; vf32 step_vec = (vf32){step, 0, step, 0}; // get inverse-step using float magic (since taking the reciprocal of a power of 2 yields a 1-bit error) qword q_step = si_from_float(step); qword q_magic = si_ilhu(0x7F00); inv_step = si_to_float(si_sf(q_step, q_magic)); // set clip window clip_min = g_WaterObject.m_origin; clip_max = g_WaterObject.m_origin + g_WaterObject.m_dimensions; // set origin at gridpoint below clip min f32 magic_float = 1.5f * 8388608.0f * step; vf32 magic_vf32 = (vf32){magic_float, 0, magic_float, 0}; origin_world = (clip_min + magic_vf32) - magic_vf32; // compute gridpoint above clip max vf32 max_corner = (clip_max + magic_vf32) - magic_vf32; max_corner += step_vec; // offset both corners by the necessary amount of padding origin_world -= step_vec * spu_splats(8.0f); max_corner += step_vec * spu_splats(8.0f); // set num cols & num rows vf32 dims = max_corner - origin_world; nc = (i32)(spu_extract(dims,0) * inv_step) + 1; nr = (i32)(spu_extract(dims,2) * inv_step) + 1; // record true nc, nr true_nc = nc - 16; true_nr = nr - 16; // alignment requirements (ooh, that's a bit strict) nc = (nc + 7) & -8; nr = (nr + 7) & -8; // deal with large grids if (nc > 80) { nc = 80; true_nc = 64; dims = spu_insert((nc-1)*step, dims, 0); } if (nr > 80) { nr = 80; true_nr = 64; dims = spu_insert((nr-1)*step, dims, 2); } max_corner = origin_world + dims; even_step = step; even_inv_step = inv_step; even_basis_col = (vf32){1.0f, 0.0f, 0.0f, 0.0f}; even_basis_row = (vf32){0.0f, 0.0f, 1.0f, 0.0f}; const f32 r = 0.707106781187f; odd_step = even_step * r; odd_inv_step= even_inv_step * r * 2.0f; odd_basis_col = (vf32){ r, 0.0f, r, 0.0f}; odd_basis_row = (vf32){-r, 0.0f, r, 0.0f}; basis_col = even_basis_col; basis_row = even_basis_row; dvc_world = spu_splats(step) * 
basis_col; dvr_world = spu_splats(step) * basis_row; // set base lod origin g_RenderData.m_origins[0] = origin_world; g_RenderData.m_cols_rows[0] = nc<<8 | nr; c0_amb = 0; r0_amb = 0; SetBasisEtc(0,0); }