/**
 * Returns 1.0f in each element where v > vth, 0.0f otherwise.
 * @param v    input values
 * @param vth  per-element thresholds
 */
inline vec_float4 updateF(vec_float4 v, vec_float4 vth)
{
    vec_float4 updateF_rtn = spu_splats(0.0f);
    vec_uint4 gt = spu_cmpgt(spu_sub(v, vth), spu_splats(0.0f));
    updateF_rtn = spu_sel(updateF_rtn, spu_splats(1.0f), gt);
    updateF_rtn = spu_sel(updateF_rtn, spu_splats(0.0f), spu_nand(gt, gt));
    return updateF_rtn;
}
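/* Since spu_sel routes one source bit per mask bit, the two selects above can
 * collapse into a single compare-and-select. A minimal sketch, equivalent for
 * ordinary (non-NaN) inputs; assumes <spu_intrinsics.h>: */
#include <spu_intrinsics.h>

static inline vec_float4 updateF_onesel(vec_float4 v, vec_float4 vth)
{
    /* 1.0f where v > vth, 0.0f elsewhere -- one compare, one select. */
    return spu_sel(spu_splats(0.0f), spu_splats(1.0f), spu_cmpgt(v, vth));
}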
void compute(
    in0_type const&  in,
    out0_type const& out,
    Pinfo const&     p_in,
    Pinfo const&     p_out)
{
#if DEBUG
    printf("uk_ccfft_f(%d): compute -- start %05x %05x\n", size, in, out);
#endif
    // Handle inverse FFT explicitly so that shuffle and scale can happen
    // in a single step.
    cml_core_ccfft1d_op_mi_f(fft, (float*)in, (float*)out, CML_FFT_FWD);

    if (dir == -1)
    {
      if (scale != 1.f)
        cml_core_rcsvmul1_f(scale, (float*)out, (float*)out, size);
    }
    else
    {
      // Code for the inverse FFT taken from the CBE SDK Libraries
      // Overview and Users Guide, sec. 8.1.
      int const vec_size = 4;
      vector float* start = (vector float*)out;
      vector float* end = start + 2 * size / vec_size;
      vector float s0, s1, e0, e1;
      vector unsigned int mask = (vector unsigned int){-1, -1, 0, 0};
      vector float vscale = spu_splats(scale);
      unsigned int i;

      // Scale the output vector and swap the order of the outputs.
      // Note: there are two float values for each of 'n' complex values.
      s0 = e1 = *start;
      for (i = 0; i < size / vec_size; ++i)
      {
        s1 = *(start + 1);
        e0 = *(--end);
        *start++ = spu_mul(spu_sel(e0, e1, mask), vscale);
        *end     = spu_mul(spu_sel(s0, s1, mask), vscale);
        s0 = s1;
        e1 = e0;
      }
    }
  }

  // Member data
  size_t size;
  int dir;
  float scale;
  fft1d_f* fft;

  static char buf1[FFT_BUF1_SIZE_BYTES];
  static char buf2[FFT_BUF2_SIZE_BYTES];
};
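/* For reference, the vectorized scale-and-swap above amounts to the scalar
 * loop below: the DC term stays in place, the remaining n-1 complex values
 * are reversed, and everything is scaled. Illustrative sketch only; the
 * 'cplx' pair type is hypothetical. */
typedef struct { float re, im; } cplx;

static void scale_and_reverse(cplx* out, unsigned int n, float scale)
{
    unsigned int i, j;
    out[0].re *= scale;
    out[0].im *= scale;
    for (i = 1, j = n - 1; i < j; ++i, --j) {
        cplx tmp = out[i];
        out[i].re = out[j].re * scale;  out[i].im = out[j].im * scale;
        out[j].re = tmp.re * scale;     out[j].im = tmp.im * scale;
    }
    if (i == j) {  /* middle element, present when n is even */
        out[i].re *= scale;
        out[i].im *= scale;
    }
}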
static btVector3 convexHullSupport(const btVector3& localDirOrg, const btVector3* points, int numPoints, const btVector3& localScaling)
{
    btVector3 vec = localDirOrg * localScaling;

#if defined (__CELLOS_LV2__) && defined (__SPU__)

    btVector3 localDir = vec;
    vec_float4 v_distMax = {-FLT_MAX,0,0,0};
    vec_int4 v_idxMax = {-999,0,0,0};
    int v = 0;
    int numverts = numPoints;

    for (; v < (int)numverts - 4; v += 4)
    {
        vec_float4 p0 = vec_dot3(points[v  ].get128(), localDir.get128());
        vec_float4 p1 = vec_dot3(points[v+1].get128(), localDir.get128());
        vec_float4 p2 = vec_dot3(points[v+2].get128(), localDir.get128());
        vec_float4 p3 = vec_dot3(points[v+3].get128(), localDir.get128());
        const vec_int4 i0 = {v  ,0,0,0};
        const vec_int4 i1 = {v+1,0,0,0};
        const vec_int4 i2 = {v+2,0,0,0};
        const vec_int4 i3 = {v+3,0,0,0};
        vec_uint4  retGt01 = spu_cmpgt(p0, p1);
        vec_float4 pmax01  = spu_sel(p1, p0, retGt01);
        vec_int4   imax01  = spu_sel(i1, i0, retGt01);
        vec_uint4  retGt23 = spu_cmpgt(p2, p3);
        vec_float4 pmax23  = spu_sel(p3, p2, retGt23);
        vec_int4   imax23  = spu_sel(i3, i2, retGt23);
        vec_uint4  retGt0123 = spu_cmpgt(pmax01, pmax23);
        vec_float4 pmax0123  = spu_sel(pmax23, pmax01, retGt0123);
        vec_int4   imax0123  = spu_sel(imax23, imax01, retGt0123);
        vec_uint4  retGtMax  = spu_cmpgt(v_distMax, pmax0123);
        v_distMax = spu_sel(pmax0123, v_distMax, retGtMax);
        v_idxMax  = spu_sel(imax0123, v_idxMax, retGtMax);
    }
    for (; v < (int)numverts; v++)
    {
        vec_float4 p = vec_dot3(points[v].get128(), localDir.get128());
        const vec_int4 i = {v,0,0,0};
        vec_uint4 retGtMax = spu_cmpgt(v_distMax, p);
        v_distMax = spu_sel(p, v_distMax, retGtMax);
        v_idxMax  = spu_sel(i, v_idxMax, retGtMax);
    }
    int ptIndex = spu_extract(v_idxMax, 0);
    const btVector3& supVec = points[ptIndex] * localScaling;
    return supVec;
#else

    btScalar maxDot;
    long ptIndex = vec.maxDot(points, numPoints, maxDot);
    btAssert(ptIndex >= 0);
    btVector3 supVec = points[ptIndex] * localScaling;
    return supVec;
#endif //__SPU__
}
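/* The unrolled loop above is a branchless running argmax: each
 * spu_cmpgt/spu_sel pair keeps either the incumbent best distance/index or
 * the challenger. Stripped of the 4x unrolling it reduces to the idiom
 * below -- a sketch over a plain float array, names hypothetical: */
#include <float.h>
#include <spu_intrinsics.h>

static int argmax_branchless(const float* dots, int n)
{
    vec_float4 v_max = spu_splats(-FLT_MAX);
    vec_int4   v_idx = spu_splats(-1);
    int v;
    for (v = 0; v < n; ++v) {
        vec_float4 p    = spu_splats(dots[v]);   /* challenger value */
        vec_int4   i    = spu_splats(v);         /* challenger index */
        vec_uint4  keep = spu_cmpgt(v_max, p);   /* keep incumbent?  */
        v_max = spu_sel(p, v_max, keep);
        v_idx = spu_sel(i, v_idx, keep);
    }
    return spu_extract(v_idx, 0);
}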
void check_pull_dma(int side)
{
    // Check left
    if (md[am].held_tag[side] < 32) {
        mfc_write_tag_mask( 1 << md[am].held_tag[side] );
        int status = mfc_read_tag_status_immediate();
        if (status) {
            // Update idx
            md[am].idx[side][HEAD] = spu_add(md[am].idx[side][HEAD], md[am].num_waiting[side]);
            vector signed int buffer_size = spu_splats(mcb[am].buffer_size[side] - 1);
            vector unsigned int cmp_v = spu_cmpgt(md[am].idx[side][HEAD], buffer_size);
            vector signed int zeros = {0,0,0,0};
            buffer_size = spu_add(buffer_size, 1);
            zeros = spu_sel(zeros, buffer_size, cmp_v);
            md[am].idx[side][HEAD] = spu_sub(md[am].idx[side][HEAD], zeros);

            md[am].num_pulled[side] += md[am].num_waiting[side];
            md[am].num_waiting[side] = 0;
            if (md[am].num_pulled[side] == mcb[am].data_size[side]) {
                md[am].mm_depleted[side] = 1;
            }
            // Release tag
            mfc_tag_release( md[am].held_tag[side] );
            md[am].held_tag[side] = 32;
        }
    }
}
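/* The spu_cmpgt/spu_sel pair above is a branchless ring-buffer wrap: where
 * the advanced index exceeds buffer_size - 1, subtract buffer_size, else
 * subtract zero. The same idiom in isolation (a sketch; assumes the step
 * never exceeds one full buffer length, as in the code above): */
static vec_int4 wrap_index(vec_int4 idx, vec_int4 n, int buffer_size)
{
    vec_int4  size = spu_splats(buffer_size);
    vec_int4  next = spu_add(idx, n);
    vec_uint4 over = spu_cmpgt(next, spu_sub(size, spu_splats(1)));
    /* subtract size only in the elements that wrapped */
    return spu_sub(next, spu_sel(spu_splats(0), size, over));
}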
vec_uint4 ulpDiff_f4(vec_float4 ref, vec_float4 vals)
{
    vec_int4 refi  = (vec_int4)ref;
    vec_int4 valsi = (vec_int4)vals;
    vec_int4 diff    = spu_sub(refi, valsi);
    vec_int4 negdiff = spu_sub(spu_splats((int)0), diff);

    // Select the positive difference per element: |refi - valsi|.
    return (vec_uint4)(spu_sel(negdiff, diff, spu_cmpgt(diff, 0)));
}
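/* Usage sketch: the ULP distance of some values from a reference. The exact
 * counts depend on how the literals round to single precision (expected here:
 * { 0, 1, 1, 2 }); note the integer subtraction above only measures ULPs
 * meaningfully when ref and vals share a sign. */
vec_float4 ref  = spu_splats(1.0f);
vec_float4 vals = (vec_float4){ 1.0f, 1.0000001f, 0.99999994f, 1.0000002f };
vec_uint4  d    = ulpDiff_f4(ref, vals);
printf("%u %u %u %u\n", spu_extract(d, 0), spu_extract(d, 1),
                        spu_extract(d, 2), spu_extract(d, 3));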
unsigned int
__mfc_multi_tag_reserve (unsigned int number_of_tags)
{
  vector unsigned int table_copy;
  vector unsigned int one = (vector unsigned int)
        { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
  vector unsigned int count_busy, is_valid;
  vector unsigned int count_total;
  vector unsigned int count_avail = (vector unsigned int) { 0, 0, 0, 0 };
  vector unsigned int index = (vector unsigned int) { 0, 0, 0, 0 };

  table_copy = __mfc_tag_table;

  /* count_busy:  number of consecutive busy tags
     count_avail: number of consecutive free tags
     table_copy:  temporary copy of the tag table
     count_total: sum of count_busy and count_avail
     index:       index of the current working tag  */
  do
    {
      table_copy = spu_sl (table_copy, count_avail);

      count_busy = spu_cntlz (table_copy);
      table_copy = spu_sl (table_copy, count_busy);
      count_avail = spu_cntlz (spu_xor(table_copy, -1));
      count_total = spu_add (count_busy, count_avail);
      index = spu_add (index, count_total);
    }
  while (spu_extract (count_avail, 0) < number_of_tags
         && spu_extract (table_copy, 0) != 0);

  index = spu_sub (index, count_avail);

  /* is_valid is set to 0xFFFFFFFF if table_copy == 0, 0 otherwise.  */
  is_valid = spu_cmpeq (table_copy, 0);
  index = spu_sel (index, is_valid, is_valid);

  /* Now I need to actually mark the tags as used.  */
  table_copy = spu_sl (one, number_of_tags);
  table_copy = spu_rl (table_copy, -number_of_tags - spu_extract (index, 0));
  table_copy = spu_sel (table_copy, __mfc_tag_table, table_copy);
  __mfc_tag_table = spu_sel (table_copy, __mfc_tag_table, is_valid);

  return spu_extract (index, 0);
}
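/* Usage sketch for the multi-tag routines (the matching release appears
 * later in this collection): reserve a run of consecutive DMA tags, use
 * them, give them back. On failure the reserve returns 0xFFFFFFFF. */
unsigned int first = __mfc_multi_tag_reserve(4);
if (first != 0xFFFFFFFF) {
    /* ... issue MFC DMAs using tag IDs first .. first+3 ... */
    __mfc_multi_tag_release(first, 4);
}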
int main(int argc, char **argv)
{
  int i;
  vector unsigned int all_ones = (vector unsigned int)
      {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
  vector unsigned int all_zeroes = (vector unsigned int)
      {0x00000000, 0x00000000, 0x00000000, 0x00000000};

  /* These bits will form the selection mask */
  unsigned short mask = 0x9;

  /* Each bit in 0x9 forms a word in the mask */
  vector unsigned int resultw = spu_sel(all_zeroes, all_ones, spu_maskw(mask));
  printf("resultw: ");
  for (i=0; i<4; i++) {
    printf("%08x", spu_extract(resultw, i));
  }

  /* Each bit in 0x09 forms a halfword in the mask */
  vector unsigned short resulth = spu_sel((vector unsigned short)all_zeroes,
                                          (vector unsigned short)all_ones,
                                          spu_maskh(mask));
  printf("\nresulth: ");
  for (i=0; i<8; i++) {
    printf("%04x", spu_extract(resulth, i));
  }

  /* Each bit in 0x0009 forms a byte in the mask */
  vector unsigned char resultb = spu_sel((vector unsigned char)all_zeroes,
                                         (vector unsigned char)all_ones,
                                         spu_maskb(mask));
  printf("\nresultb: ");
  for (i=0; i<16; i++) {
    printf("%02x", spu_extract(resultb, i));
  }
  printf("\n");

  return 0;
}
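/* For reference, with mask = 0x9 and the documented mapping (the most
 * significant used bit of the mask selects element 0), the program should
 * print something like:
 *
 *   resultw: ffffffff0000000000000000ffffffff
 *   resulth: 0000000000000000ffff00000000ffff
 *   resultb: 000000000000000000000000ff0000ff
 */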
void cp_buffer(int side)
{
    int avail_out  = num_free_in_buffer(OUT);
    int avail_side = num_in_buffer(side);
    int max = avail_out < avail_side ? avail_out : avail_side;

    vector signed int *out_head;
    if (mcb[am].local[OUT] < 255)
        out_head = (vector signed int*) &md[ mcb[am].local[OUT] ].idx[ (mcb[am].id+1)&1 ][HEAD];
    else
        out_head = (vector signed int*) &md[am].idx[OUT][HEAD];

    vector unsigned int cmp_v;
    vector signed int from_size = spu_splats( mcb[am].buffer_size[side] );
    vector signed int out_size  = spu_splats( mcb[ mcb[am].local[OUT] ].buffer_size[ (mcb[am].id+1)&1 ] );
    vector signed int ones  = {1,1,1,1};
    vector signed int zeros = {0,0,0,0};

    int i;
    for (i = 0; i < max; i++) {
        md[am].buffer[OUT][spu_extract(*out_head,0)] =
            md[am].buffer[side][spu_extract(md[am].idx[side][TAIL],0)];

        // update idx
        md[am].idx[side][TAIL] = spu_add(md[am].idx[side][TAIL], ones);
        cmp_v = spu_cmpeq(md[am].idx[side][TAIL], from_size);
        md[am].idx[side][TAIL] = spu_sel(md[am].idx[side][TAIL], zeros, cmp_v);

        *out_head = spu_add(*out_head, ones);
        cmp_v = spu_cmpeq(*out_head, out_size);
        *out_head = spu_sel(*out_head, zeros, cmp_v);
    }

    update_tail(side);
    md[am].consumed[side] += max;

    if (mcb[am].local[OUT] < 255 && md[am].consumed[side] == mcb[am].data_size[side]) {
        md[am].depleted[side] = 1;
        md[am].done = 1;
        --num_active_mergers;
    }
}
unsigned int
__mfc_multi_tag_release (unsigned int first_tag, unsigned int number_of_tags)
{
  vector unsigned int table_copy, tmp, tmp1;
  vector unsigned int one = (vector unsigned int)
        { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
  vector unsigned int is_invalid;
  unsigned int last_tag;
  vector unsigned int has_been_reserved;

  last_tag = first_tag + number_of_tags;

  table_copy = spu_sl (one, number_of_tags);
  table_copy = spu_rl (table_copy, -last_tag);
  table_copy = spu_xor (table_copy, -1);

  /* Make sure the tags are in range and valid.  */
  tmp  = spu_cmpgt (spu_promote(last_tag, 0), 32);
  tmp1 = spu_cmpgt (spu_promote(number_of_tags, 0), 32);
  is_invalid = spu_cmpgt (spu_promote(first_tag, 0), 31);

  /* All bits are set to 1 if invalid, 0 if valid.  */
  is_invalid = spu_or (tmp, is_invalid);
  is_invalid = spu_or (tmp1, is_invalid);

  /* check whether these tags have been reserved */
  tmp  = spu_rlmask (one, (int)-number_of_tags);
  tmp1 = spu_sl (__mfc_tag_table, first_tag);
  has_been_reserved = spu_cmpgt(tmp1, tmp);

  is_invalid = spu_or (has_been_reserved, is_invalid);

  table_copy = spu_sel (__mfc_tag_table, table_copy, table_copy);
  __mfc_tag_table = spu_sel (table_copy, __mfc_tag_table, is_invalid);

  return spu_extract (is_invalid, 0);
}
unsigned int
__mfc_tag_reserve (void)
{
  vector unsigned int mask = (vector unsigned int)
        { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
  vector unsigned int count_zeros, is_valid;
  vector signed int count_neg;

  count_zeros = spu_cntlz (__mfc_tag_table);
  count_neg = spu_sub (0, (vector signed int) count_zeros);

  mask = spu_rlmask (mask, (vector signed int) count_neg);
  __mfc_tag_table = spu_andc (__mfc_tag_table, mask);

  is_valid = spu_cmpeq (count_zeros, 32);
  count_zeros = spu_sel (count_zeros, is_valid, is_valid);

  return spu_extract (count_zeros, 0);
}
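/* Usage sketch: __mfc_tag_reserve returns 0xFFFFFFFF when no tag is free,
 * otherwise a tag ID in 0..31 usable with the MFC DMA commands. */
unsigned int tag = __mfc_tag_reserve();
if (tag != 0xFFFFFFFF) {
    /* ... issue mfc_get/mfc_put with this tag, wait on mask 1 << tag ... */
    mfc_tag_release(tag);
}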
/* Scans the string pointed to by s for the character c and
 * returns a pointer to the last occurrence of c. If
 * c is not found, then NULL is returned.
 */
char * strrchr(const char *s, int c)
{
  int nskip;
  vec_uchar16 *ptr, data, vc;
  vec_uint4 cmp_c, cmp_0, cmp;
  vec_uint4 res_ptr, res_cmp;
  vec_uint4 mask, result;
  vec_uint4 one = spu_splats(0xffffU);

  /* Scan memory array a quadword at a time. Skip leading
   * mis-aligned bytes.
   */
  ptr = (vec_uchar16 *)s;

  nskip = -((unsigned int)(ptr) & 15);
  mask = spu_rlmask(one, nskip);

  vc = spu_splats((unsigned char)(c));

  data = *ptr++;
  ptr = (vec_uchar16 *)((unsigned int)ptr & ~15);

  cmp_c = spu_and(spu_gather(spu_cmpeq(data, vc)), mask);
  cmp_0 = spu_and(spu_gather(spu_cmpeq(data, 0)), mask);

  res_ptr = spu_splats(0U);
  res_cmp = spu_splats(0U);

  while (spu_extract(cmp_0, 0) == 0) {
    cmp = spu_cmpeq(cmp_c, 0);

    res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp);
    res_cmp = spu_sel(cmp_c, res_cmp, cmp);

    data = *ptr++;

    cmp_c = spu_gather(spu_cmpeq(data, vc));
    cmp_0 = spu_gather(spu_cmpeq(data, 0));

    cmp = spu_cmpeq(cmp_c, 0);
  }

  /* Compute the location of the last character before the termination
   * character.
   *
   * First mask off compare results following the first termination character.
   */
  mask = spu_sl(one, 31 - spu_extract(spu_cntlz(cmp_0), 0));
  cmp_c = spu_and(cmp_c, mask);

  /* Conditionally update res_ptr and res_cmp if a match was found in the last
   * quadword.
   */
  cmp = spu_cmpeq(cmp_c, 0);

  res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp);
  res_cmp = spu_sel(cmp_c, res_cmp, cmp);

  /* Bit reverse res_cmp for locating the last occurrence.
   */
  mask = spu_cmpeq(res_cmp, 0);

  res_cmp = (vec_uint4)spu_maskb(spu_extract(res_cmp, 0));
  res_cmp = spu_gather((vec_uchar16)spu_shuffle(res_cmp, res_cmp,
                        VEC_LITERAL(vec_uchar16,
                                    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)));

  /* Compute the location (ptr) of the last occurrence of c. If no
   * occurrence was found (ie, element 0 of res_cmp == 0), then return
   * NULL.
   */
  result = spu_sub(spu_add(res_ptr, 15), spu_cntlz(res_cmp));
  result = spu_andc(result, mask);

  return ((char *)spu_extract(result, 0));
}
/* Majority via select: where x and z differ the result is y, otherwise it
 * is x (== z). Equal to Maj(x,y,z) = (x&y) | (x&z) | (y&z). */
static inline vec_uint4 vec_Maj(vec_uint4 x, vec_uint4 y, vec_uint4 z)
{
    return spu_sel(x, y, spu_xor(x, z));
}
/* SHA-2 "choose": per bit, if x then y else z -- exactly the multiplex
 * that spu_sel computes. */
static inline vec_uint4 vec_Ch(vec_uint4 x, vec_uint4 y, vec_uint4 z)
{
    return spu_sel(z, y, x);
}
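/* A reference form of the same function for comparison,
 * Ch(x,y,z) = (x & y) ^ (~x & z): */
static inline vec_uint4 vec_Ch_ref(vec_uint4 x, vec_uint4 y, vec_uint4 z)
{
    return spu_xor(spu_and(x, y), spu_andc(z, x));  /* (x & y) ^ (z & ~x) */
}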
vector double __divv2df3 (vector double a_in, vector double b_in)
{
  /* Variables */
  vec_int4    exp, exp_bias;
  vec_uint4   no_underflow, overflow;
  vec_float4  mant_bf, inv_bf;
  vec_ullong2 exp_a, exp_b;
  vec_ullong2 a_nan, a_zero, a_inf, a_denorm, a_denorm0;
  vec_ullong2 b_nan, b_zero, b_inf, b_denorm, b_denorm0;
  vec_ullong2 nan;
  vec_uint4   a_exp, b_exp;
  vec_ullong2 a_mant_0, b_mant_0;
  vec_ullong2 a_exp_1s, b_exp_1s;
  vec_ullong2 sign_exp_mask;
  vec_double2 a, b;
  vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult;

  /* Constants */
  vec_uint4   exp_mask_u32 = spu_splats((unsigned int)0x7FF00000);
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3,  8, 9,10,11, 8,9,10,11 };
  vec_uchar16 swap_32  = (vec_uchar16) { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
  vec_ullong2 exp_mask  = spu_splats(0x7FF0000000000000ULL);
  vec_ullong2 sign_mask = spu_splats(0x8000000000000000ULL);
  vec_float4  onef   = spu_splats(1.0f);
  vec_double2 one    = spu_splats(1.0);
  vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);

  sign_exp_mask = spu_or(sign_mask, exp_mask);

  /* Extract the floating point components from each of the operands including
   * exponent and mantissa.
   */
  a_exp = (vec_uint4)spu_and((vec_uint4)a_in, exp_mask_u32);
  a_exp = spu_shuffle(a_exp, a_exp, splat_hi);
  b_exp = (vec_uint4)spu_and((vec_uint4)b_in, exp_mask_u32);
  b_exp = spu_shuffle(b_exp, b_exp, splat_hi);

  a_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_mask), 0);
  a_mant_0 = spu_and(a_mant_0, spu_shuffle(a_mant_0, a_mant_0, swap_32));

  b_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_mask), 0);
  b_mant_0 = spu_and(b_mant_0, spu_shuffle(b_mant_0, b_mant_0, swap_32));

  a_exp_1s = (vec_ullong2)spu_cmpeq(a_exp, exp_mask_u32);
  b_exp_1s = (vec_ullong2)spu_cmpeq(b_exp, exp_mask_u32);

  /* Identify all possible special values that must be accommodated including:
   * +-denorm, +-0, +-infinity, and NaNs.
   */
  a_denorm0 = (vec_ullong2)spu_cmpeq(a_exp, 0);
  a_nan     = spu_andc(a_exp_1s, a_mant_0);
  a_zero    = spu_and (a_denorm0, a_mant_0);
  a_inf     = spu_and (a_exp_1s, a_mant_0);
  a_denorm  = spu_andc(a_denorm0, a_zero);

  b_denorm0 = (vec_ullong2)spu_cmpeq(b_exp, 0);
  b_nan     = spu_andc(b_exp_1s, b_mant_0);
  b_zero    = spu_and (b_denorm0, b_mant_0);
  b_inf     = spu_and (b_exp_1s, b_mant_0);
  b_denorm  = spu_andc(b_denorm0, b_zero);

  /* Scale denorm inputs into normalized numbers by conditionally scaling the
   * input parameters.
   */
  a = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_mask));
  a = spu_sel(a_in, a, a_denorm);

  b = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_mask));
  b = spu_sel(b_in, b, b_denorm);

  /* Extract the divisor and dividend exponent and force parameters into the signed
   * range [1.0,2.0) or [-1.0,2.0).
   */
  exp_a = spu_and((vec_ullong2)a, exp_mask);
  exp_b = spu_and((vec_ullong2)b, exp_mask);

  mant_a = spu_sel(a, one, (vec_ullong2)exp_mask);
  mant_b = spu_sel(b, one, (vec_ullong2)exp_mask);

  /* Approximate the single precision reciprocal of b by using
   * the single precision reciprocal estimate followed by one
   * single precision iteration of Newton-Raphson.
   */
  mant_bf = spu_roundtf(mant_b);
  inv_bf = spu_re(mant_bf);
  inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf);

  /* Perform 2 more Newton-Raphson iterations in double precision. The
   * result (q1) is in the range (0.5, 2.0).
   */
  inv_b = spu_extend(inv_bf);
  inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b);
  q0 = spu_mul(mant_a, inv_b);
  q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0);

  /* Determine the exponent correction factor that must be applied
   * to q1 by taking into account the exponent of the normalized inputs
   * and the scale factors that were applied to normalize them.
   */
  exp = spu_rlmaska(spu_sub((vec_int4)exp_a, (vec_int4)exp_b), -20);
  exp = spu_add(exp, (vec_int4)spu_add(spu_and((vec_int4)a_denorm, -0x34),
                                       spu_and((vec_int4)b_denorm, 0x34)));

  /* Bias the quotient exponent depending on the sign of the exponent correction
   * factor so that a single multiplier will ensure the entire double precision
   * domain (including denorms) can be achieved.
   *
   *    exp      bias q1     adjust exp
   *   =====     ========    ==========
   *   positive   2^+65        -65
   *   negative   2^-64        +64
   */
  exp_bias = spu_xor(spu_rlmaska(exp, -31), 64);
  exp = spu_sub(exp, exp_bias);

  q1 = spu_sel(q1, (vec_double2)spu_add((vec_int4)q1, spu_sl(exp_bias, 20)), exp_mask);

  /* Compute a multiplier (mult) to be applied to the quotient (q1) to produce the
   * expected result. On overflow, clamp the multiplier to the maximum non-infinite
   * number in case the rounding mode is not round-to-nearest.
   */
  exp = spu_add(exp, 0x3FF);
  no_underflow = spu_cmpgt(exp, 0);
  overflow = spu_cmpgt(exp, 0x7FE);
  exp = spu_and(spu_sl(exp, 20), (vec_int4)no_underflow);
  exp = spu_and(exp, (vec_int4)exp_mask);

  mult = spu_sel((vec_double2)exp,
                 (vec_double2)(spu_add((vec_uint4)exp_mask, -1)),
                 (vec_ullong2)overflow);

  /* Handle special value conditions. These include:
   *
   * 1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN a NaN
   *    results.
   * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN an INFINITY results.
   * 3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0 results.
   */
  mult = spu_andc(mult, (vec_double2)spu_or(a_zero, b_inf));
  mult = spu_sel(mult, (vec_double2)exp_mask, spu_or(a_inf, b_zero));

  nan = spu_or(a_nan, b_nan);
  nan = spu_or(nan, spu_and(a_zero, b_zero));
  nan = spu_or(nan, spu_and(a_inf, b_inf));

  mult = spu_or(mult, (vec_double2)nan);

  /* Scale the final quotient */
  q2 = spu_mul(q1, mult);

  return (q2);
}
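/* Usage sketch for the element-wise double-precision division above: */
vector double a = (vector double){ 1.0, -2.5 };
vector double b = (vector double){ 4.0,  0.5 };
vector double q = __divv2df3(a, b);   /* expect { 0.25, -5.0 } */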
void draw_frame(uint64_t buf_ea)
{
  vec_uint4 buf[2*1920/4];
  int row, col, i, tag = 0;
  float step = 4.0f/spu.width*spu.zoom;
  float xbeg = spu.xc - spu.width*step*0.5f;
  vec_float4 vxbeg = spu_splats(xbeg)
                     + spu_splats(step) * (vec_float4){ 0.f,1.f,2.f,3.f };
  vec_float4 xstep = spu_splats(step)*spu_splats(4.f);
  vec_float4 vyp = spu_splats(spu.yc - spu.height*step*0.5f + step*spu.rank);
  const vec_float4 vinc = spu_splats(spu.count * step);
  const vec_float4 esc2 = spu_splats(BAILOUT*BAILOUT);
#if BAILBITS != 1
  const vec_float4 esc21 = spu_splats(4.f/(BAILOUT*BAILOUT));
#endif
  const vec_float4 two = spu_splats(2.f);
  const vec_float4 zero = spu_splats(0.f);
  const vec_float4 colsc = spu_splats(255.f);
  const vec_float4 ccr = spu_splats(4.f*BAILOUT/(3.5f*3.141592654f));
  const vec_float4 ccg = spu_splats(4.f*BAILOUT/(5.f*3.141592654f));
  const vec_float4 ccb = spu_splats(4.f*BAILOUT/(9.f*3.141592654f));

  vec_float4 x, y, x2, y2, m2, vxp;
  vec_uint4 cmp, inc;
  vec_uint4 vi;
  vec_uint4 *p, *b;
  vec_float4 co;

  /* Process the full image. As there are 6 SPUs working in parallel, each with
   * a different rank from 0 to 5, each SPU processes only the line numbers:
   * rank, rank+6, rank+12, ...
   * The program uses an SPU DMA programming technique known as "double buffering",
   * where the previously generated line is transmitted to main memory while we
   * compute the next one, hence the need for a local buffer containing two lines.
   */
  for (row = spu.rank; row < spu.height; row += spu.count)
  {
    /* Pixel buffer address (in local memory) of the next line to be drawn */
    b = p = buf + ((1920/4)&-tag);
    vxp = vxbeg; /* first four x coordinates */

    /* Process a whole screen line by packets of 4 pixels */
    for (col = spu.width/4; col > 0; col--)
    {
      vi = spu_splats(0u);
      x = vxp;
      y = vyp;
      i = 0;
      cmp = spu_splats(-1u);
      inc = spu_splats(1u);
      m2 = zero;

      /* This loop processes the Mandelbrot suite for the four complex numbers
       * whose real parts are the components of the x vector, and whose imaginary
       * parts are in y (as we process the same line, all initial values of y
       * are equal).
       * We perform loop unrolling for SPU performance optimization reasons,
       * hence the 4x replication of the same computation block.
       */
      do
      {
        x2 = x*x;
        y2 = y*y;
        m2 = spu_sel(m2, x2+y2, cmp);
        cmp = spu_cmpgt(esc2, m2);
        inc = spu_and(inc, cmp); /* increment the iteration count only if */
        vi = vi + inc;           /* we're still inside the bailout radius */
        y = two*x*y + vyp;
        x = x2-y2 + vxp;

        x2 = x*x;
        y2 = y*y;
        m2 = spu_sel(m2, x2+y2, cmp);
        cmp = spu_cmpgt(esc2, m2);
        inc = spu_and(inc, cmp);
        vi = vi + inc;
        y = two*x*y + vyp;
        x = x2-y2 + vxp;

        x2 = x*x;
        y2 = y*y;
        m2 = spu_sel(m2, x2+y2, cmp);
        cmp = spu_cmpgt(esc2, m2);
        inc = spu_and(inc, cmp);
        vi = vi + inc;
        y = two*x*y + vyp;
        x = x2-y2 + vxp;

        x2 = x*x;
        y2 = y*y;
        m2 = spu_sel(m2, x2+y2, cmp);
        cmp = spu_cmpgt(esc2, m2);
        inc = spu_and(inc, cmp);
        vi = vi + inc;
        y = two*x*y + vyp;
        x = x2-y2 + vxp;

        i += 4;
      }
      /* Exit the loop only if the iteration limit of 128 has been reached,
       * or all current four points are outside the bailout radius.
       * The __builtin_expect(xxx, 1) construct hints the compiler that the xxx
       * test has a greater chance of being true (1), so a branch hinting
       * instruction is inserted into the binary code to make the conditional
       * branch faster in most cases (except the last one when we exit the
       * loop). This results in a performance increase.
       */
      while (__builtin_expect((i < 128) &
                              (si_to_int((qword)spu_gather(cmp)) != 0), 1));

      /* smooth coloring: compute the fractional part */
      co = spu_convtf(vi, 0) + spu_splats(1.f);
      co -= fast_logf(fast_logf(m2) * spu_splats(.5f));
#if BAILBITS != 1
      co = spu_re(spu_rsqrte(co*esc21));
#endif
      /* Compute the red, green and blue pixel components */
      vec_uint4 cr = spu_convtu(mcos(co * ccr) * colsc, 0);
      vec_uint4 cg = spu_convtu(mcos(co * ccg) * colsc, 0);
      vec_uint4 cb = spu_convtu(mcos(co * ccb) * colsc, 0);

      /* Put the 4 pixel values in the buffer */
      *p++ = (spu_sl(cr, 16) | spu_sl(cg, 8) | cb) & ~-inc;

      vxp += xstep;
    }

    /* double-buffered dma: initiate a dma transfer of the last computed scanline
     * then wait for completion of the second-to-last transfer (previously computed
     * line). This is done by changing the tag value.
     */
    mfc_put(b, buf_ea+(spu.width*4)*row, spu.width*4, tag, 0, 0);
    tag = 1 - tag;
    wait_for_completion(tag);
    vyp += vinc;
  }
  /* wait for completion of the last sent image line */
  wait_for_completion(1-tag);
}
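/* One lane of the vectorized escape-time loop, as a scalar reference
 * (a sketch: the real loop above runs four pixels at once and, being
 * unrolled, always advances the count in steps of four): */
static int mandel_iters(float cx, float cy, float bailout2, int maxit)
{
    float x = cx, y = cy;
    float x2 = x * x, y2 = y * y;
    int i = 0;
    while (i < maxit && x2 + y2 < bailout2) {
        y = 2.0f * x * y + cy;
        x = x2 - y2 + cx;
        x2 = x * x;
        y2 = y * y;
        ++i;
    }
    return i;
}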
void MinMaxBinFindBest3SIMD(minmaxbin_t *mmb, kdbuffer_t *result)
{
  int i;

  for (i = 1; i < mmb->numbins; i++)
  {
    int j = mmb->numbins - i - 1;
    vector float *min = (vector float *)mmb->minbins[i].b;
    vector float *max = (vector float *)mmb->maxbins[j].b;
    min[0] = spu_add(min[0], min[-1]);
    max[0] = spu_add(max[0], max[1]);
  }

  vector float *vmax = (vector float*)result->baabb.max;
  vector float *vmin = (vector float*)result->baabb.min;
  vector float vwidth = spu_abs( spu_sub(*vmax, *vmin) );
  vector float vnumbins = spu_splats(1/(float)mmb->numbins);
  vector float vdelta = spu_mul(vwidth, vnumbins);
  vector float vx = spu_add(*vmin, vdelta);
  vector float vside = { vwidth[1] * vwidth[2],
                         vwidth[0] * vwidth[2],
                         vwidth[0] * vwidth[1],
                         0 };
  vector float invarea = spu_splats( 1/(vwidth[0] * vside[0]) );
  vector float vctravers = spu_splats(2.0f);
  vector float vbestcost = spu_splats(mmb->bestcost);
  vector int vbesti = spu_splats(0);
  vector float vbestx = vx;

  for (i = 0; i < mmb->numbins-1; i++)
  {
    vector float aleft, aright;
    AreaLeftRight(*vmin, *vmax, vside, vx, &aleft, &aright);

    vector float *vminbin = (vector float *)mmb->minbins[i].b;
    vector float *vmaxbin = (vector float *)mmb->maxbins[i+1].b;

    vector float cost = SAHCostSIMD(invarea, vctravers, *vminbin, aleft, *vmaxbin, aright);

    vector unsigned int cmp = spu_cmpgt(cost, vbestcost);
    vbestcost = spu_sel(cost, vbestcost, cmp);
    vbesti = spu_sel(spu_splats(i), vbesti, cmp);
    vbestx = spu_sel(vx, vbestx, cmp);

    vx = spu_add(vx, vdelta);
  }

  int axis = 0;
  float bestcost = vbestcost[axis];
  if (vbestcost[1] < bestcost) { axis = 1; bestcost = vbestcost[1]; }
  if (vbestcost[2] < bestcost) { axis = 2; bestcost = vbestcost[2]; }

  int index = vbesti[axis];
  result->plane = vbestx[axis];
  result->axis = axis;
  result->left_size = (int)mmb->minbins[ index ].b[axis];
  result->right_size = (int)mmb->maxbins[ index+1 ].b[axis];
  mmb->bestcost = vbestcost[axis];
}
void process_render_tasks(unsigned long eah_render_tasks, unsigned long eal_render_tasks)
{
  const vec_uchar16 SHUFFLE_MERGE_BYTES = (vec_uchar16) {
    // merge lo bytes from unsigned shorts (array)
    1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
  };

  const vec_uchar16 SHUFFLE_GET_BUSY_WITH_ONES = (vec_uchar16) {
    // get busy flag with ones in unused bytes
    0xc0, 0xc0, 2, 3,
    0xc0, 0xc0, 0xc0, 0xc0,
    0xc0, 0xc0, 0xc0, 0xc0
  };

  const vec_uchar16 ZERO_BYTES = (vec_uchar16) spu_splats(0);

  char trianglebuffer[ 256 + TRIANGLE_MAX_SIZE ];

  char sync_buffer[128+127];
  void* aligned_sync_buffer = (void*) ( ((unsigned long)sync_buffer+127) & ~127 );

  RenderableCacheLine* cache = (RenderableCacheLine*) aligned_sync_buffer;
  unsigned long long cache_ea;

  spu_mfcdma64(&cache_ea, eah_render_tasks, eal_render_tasks, sizeof(cache_ea), 0, MFC_GET_CMD);
  mfc_write_tag_mask(1<<0);
  mfc_read_tag_status_all();

  while (cache_ea) {
    // terminate immediately if possible
    if (spu_stat_in_mbox())
      return;

    // read the cache line
    spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
    spu_readch(MFC_RdAtomicStat);

    unsigned int endTriangle = cache->endTriangle;
    vec_ushort8 testTriangle = spu_splats((unsigned short) endTriangle);

    // first look for short chunks
    vec_uchar16 next = cache->chunkNext;
    vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK));

    // change next to word offset, note we don't care what the low bit shifted in is
    vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 );
    vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1));
    vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254));

    vec_ushort8 start0 = cache->chunkStart[0];
    vec_ushort8 start1 = cache->chunkStart[1];

    vec_ushort8 nstart0 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) );
    vec_ushort8 nstart1 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) );

    vec_ushort8 starteq0 = spu_cmpeq( nstart0, spu_splats((unsigned short)0) );
    vec_ushort8 starteq1 = spu_cmpeq( nstart1, spu_splats((unsigned short)0) );

    vec_ushort8 end0 = spu_sel( nstart0, spu_splats((unsigned short)4096), starteq0);
    vec_ushort8 end1 = spu_sel( nstart1, spu_splats((unsigned short)4096), starteq1);

    vec_ushort8 len0 = spu_sub( end0, start0);
    vec_ushort8 len1 = spu_sub( end1, start1);

    vec_ushort8 small0 = spu_cmpgt( spu_splats((unsigned short)17), len0);
    vec_ushort8 small1 = spu_cmpgt( spu_splats((unsigned short)17), len1);
    vec_uchar16 small = (vec_uchar16) spu_shuffle( small0, small1, MERGE );
    vec_uint4 smallChunkGather = spu_gather(small);

    // check to see if chunk is already at the last triangle
    vec_uint4 doneChunkGather = spu_gather( (vec_uchar16) spu_shuffle(
        (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[0]),
        (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[1]),
        SHUFFLE_MERGE_BYTES) );

    // check if the chunk is free
    vec_uint4 freeChunkGather = spu_gather(
        spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) );

    // check to see if the chunk is being processed
    vec_uint4 busyChunkGather = spu_gather(
        spu_cmpgt( cache->chunkNext, //spu_and(cache->chunkNext, CHUNKNEXT_MASK),
                   spu_splats( (unsigned char) (CHUNKNEXT_BUSY_BIT-1) ) ) );

    // doneChunkGather, freeChunkGather, busyChunkGather - rightmost 16 bits of word 0
    // note that if freeChunkGather is true then busyChunkGather must also be true

    // done=false, free=false, busy=false -> can process
    // free=false, busy=false -> can be merged

    // decide which chunk to process
    vec_uint4 mayProcessGather = spu_nor( doneChunkGather, busyChunkGather );
    vec_uint4 mayProcessShortGather = spu_and( mayProcessGather, smallChunkGather );

    vec_uint4 shortSelMask = spu_cmpeq( mayProcessShortGather, spu_splats(0U) );
    vec_uint4 mayProcessSelection = spu_sel( mayProcessShortGather, mayProcessGather, shortSelMask );

    /*
    if (!spu_extract(shortSelMask, 0))
      printf("taken short: may=%04x short=%04x mayshort=%04x mask=%04x sel=%04x\n",
             spu_extract(mayProcessGather, 0) & 0xffff,
             spu_extract(smallChunkGather, 0),
             spu_extract(mayProcessShortGather, 0),
             spu_extract(shortSelMask, 0) & 0xffff,
             spu_extract(mayProcessSelection, 0) & 0xffff );
    */

    vec_uint4 mayProcessBits = spu_sl( mayProcessSelection, 16);
    unsigned int chunkToProcess = spu_extract( spu_cntlz( mayProcessBits ), 0);
    unsigned int freeChunk = spu_extract( spu_cntlz( spu_sl( freeChunkGather, 16 ) ), 0);

    // if there's nothing to process, try the next cache line in the rendering tasks list
    if (!spu_extract(mayProcessBits, 0)) {
trynextcacheline:
      cache_ea = cache->next;
      // sleep();
      continue;
    }

    unsigned int chunkStart    = cache->chunkStartArray   [chunkToProcess];
    unsigned int chunkTriangle = cache->chunkTriangleArray[chunkToProcess];
    unsigned int chunkNext     = cache->chunkNextArray    [chunkToProcess] & CHUNKNEXT_MASK;
    unsigned int chunkEnd      = (cache->chunkStartArray  [chunkNext]-1) & (NUMBER_OF_TILES-1);
    unsigned int chunkLength   = 1 + chunkEnd-chunkStart;

    // only need an extra block if the block is especially long
    if (chunkLength <= NUMBER_OF_TILES_PER_CHUNK) {
      freeChunk = 32;
    }

    // mark this block as busy
    cache->chunkNextArray[chunkToProcess] |= CHUNKNEXT_BUSY_BIT;

    // if there's at least one free chunk, claim it
    if (freeChunk != 32) {
      cache->chunkNextArray[freeChunk] = CHUNKNEXT_RESERVED;
      cache->chunkTriangleArray[freeChunk] = chunkTriangle;
    }

    // write the cache line back
    spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
    if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
      continue;

#ifdef INFO
    printf("[%d] Claimed chunk %d (%d-%d len %d) at tri %x end %x with free chunk %d\n",
           _SPUID, chunkToProcess, chunkStart, chunkEnd, chunkLength,
           chunkTriangle, endTriangle, freeChunk!=32 ? freeChunk : -1 );
    // debug_render_tasks(cache);
#endif

    Triangle* triangle;
    int firstTile;
    do {
      // read the triangle data for the current triangle
      unsigned int extra = chunkTriangle & 127;
      unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
      triangle = (Triangle*) (trianglebuffer+extra);
      unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

      // ensure DMA slot available
      do {} while (!spu_readchcnt(MFC_Cmd));

      spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea),
                   length, 0, MFC_GET_CMD);
      mfc_write_tag_mask(1<<0);
      mfc_read_tag_status_all();

      // get the triangle deltas
      firstTile = findFirstTriangleTile(triangle, chunkStart, chunkEnd);
      if (firstTile>=0)
        break;

      // no match, try next triangle
      chunkTriangle = triangle->next_triangle;
    } while (chunkTriangle != endTriangle);

    // if we actually have something to process...
    if (firstTile>=0) {
      // the "normal" splitting will now become:
      //   chunkStart      .. (firstTile-1)      -> triangle->next_triangle
      //   firstTile       .. (firstTile+NUM-1)  -> chunkTriangle (BUSY)
      //   (firstTile+NUM) .. chunkEnd           -> chunkTriangle (FREE)
      int tailChunk;
      int thisChunk;
      int nextBlockStart;
      int thisBlockStart;
      int realBlockStart;

      do {
retry:
        // read the cache line
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
        spu_readch(MFC_RdAtomicStat);

        // calculate start of next block
        nextBlockStart = firstTile + NUMBER_OF_TILES_PER_CHUNK;
        if (nextBlockStart > chunkEnd)
          nextBlockStart = chunkEnd+1;

        // calculate start of block to mark as busy
        thisBlockStart = nextBlockStart - NUMBER_OF_TILES_PER_CHUNK;
        if (thisBlockStart < chunkStart)
          thisBlockStart = chunkStart;
        realBlockStart = thisBlockStart;

#ifdef INFO
        printf("[%d] nextBlockStart=%d, realBlockStart=%d, thisBlockStart=%d, chunkStart=%d\n",
               _SPUID, nextBlockStart, realBlockStart, thisBlockStart, chunkStart);
#endif

        // allocate some more free chunks
        vec_uint4 freeChunkGather2 = spu_sl(spu_gather(spu_cmpeq(
            spu_splats((unsigned char)CHUNKNEXT_FREE_BLOCK), cache->chunkNext)), 16);
        unsigned int freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);

        if (freeChunk == 32) {
          // if we didn't have one before, try again
          freeChunk = freeChunk2;

          // and try to get the second one
          freeChunkGather2 = spu_andc( freeChunkGather2, spu_promote(0x80000000>>freeChunk2, 0) );
          freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);
        } else {
          // speculatively clear the free chunk just in case we don't need it
          cache->chunkNextArray[freeChunk] = CHUNKNEXT_FREE_BLOCK;
        }

#ifdef INFO
        printf("[%d] Free chunks %d and %d, cN=%d, nBS=%d, cE=%d, tBS=%d, cS=%d\n",
               _SPUID, freeChunk, freeChunk2, chunkNext, nextBlockStart, chunkEnd,
               thisBlockStart, chunkStart );
#endif

        // mark region after as available for processing if required
        if (nextBlockStart < chunkEnd) {
          if (freeChunk==32) {
            // if no free chunk, relinquish entire block and write back
            cache->chunkNextArray[chunkToProcess] = chunkNext;
            spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
            // if writeback failed, we *might* have a free block, retry
            if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
              goto retry;
            // otherwise give up and try the next cache line
            goto trynextcacheline;
          }
          cache->chunkStartArray[freeChunk] = nextBlockStart;
          cache->chunkNextArray[freeChunk] = chunkNext;
          cache->chunkTriangleArray[freeChunk] = chunkTriangle;
          cache->chunkNextArray[chunkToProcess] = freeChunk | CHUNKNEXT_BUSY_BIT;
          tailChunk = freeChunk;
#ifdef INFO
          printf("[%d] Insert tail, tailChunk=%d, chunkNext=%d, chunkToProcess=%d\n",
                 _SPUID, tailChunk, chunkNext, chunkToProcess);
          debug_render_tasks(cache);
#endif
        } else {
          // we're gonna use freeChunk2 for the "in front" block, as we've not
          // used freeChunk, let's use it as it's more likely to have a free chunk
          freeChunk2 = freeChunk;
          tailChunk = chunkNext;
        }

        // mark region before as available if required and possible
        thisChunk = chunkToProcess;
        if (thisBlockStart > chunkStart) {
          if (freeChunk2 != 32) {
            // mark this region as busy
            cache->chunkStartArray[freeChunk2]=thisBlockStart;
            cache->chunkNextArray[freeChunk2]=tailChunk | CHUNKNEXT_BUSY_BIT;
            cache->chunkTriangleArray[freeChunk2]=chunkTriangle;

            // mark region before as available for processing
            cache->chunkNextArray[chunkToProcess]=freeChunk2;
            cache->chunkTriangleArray[chunkToProcess]=triangle->next_triangle;
            thisChunk = freeChunk2;
#ifdef INFO
            printf("[%d] Insert new head, tailChunk=%d, chunkNext=%d, thisChunk=%d\n",
                   _SPUID, tailChunk, chunkNext, thisChunk);
            debug_render_tasks(cache);
#endif
          } else {
            // need to keep whole block, update info and mark busy
            cache->chunkTriangleArray[chunkToProcess]=chunkTriangle;
            cache->chunkNextArray[chunkToProcess]=tailChunk | CHUNKNEXT_BUSY_BIT;
            realBlockStart = chunkStart;
#ifdef INFO
            printf("[%d] Keep whole block, tailChunk=%d, chunkNext=%d, thisChunk=%d\n",
                   _SPUID, tailChunk, chunkNext, thisChunk);
            debug_render_tasks(cache);
#endif
            sleep();
          }
        }

        // merge chunks
        merge_cache_blocks(cache);

        // write the cache line back
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
      } while (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS);

      // finally after the write succeeded, update the variables
      chunkNext = tailChunk;
      chunkToProcess = thisChunk;
      chunkStart = firstTile; //thisBlockStart;
      chunkLength = nextBlockStart - firstTile;
      chunkEnd = chunkStart + chunkLength - 1;
      freeChunk = 32;

      // now we can process the block up to endTriangle
      initTileBuffers(thisBlockStart, chunkEnd);

      int ok=0;
      while (chunkTriangle != endTriangle) {
#ifdef INFO
        printf("[%d] Processing chunk %d at %4d len %4d, triangle %04x first=%d tbs=%d\n",
               _SPUID, chunkToProcess, chunkStart, chunkLength,
               chunkTriangle, firstTile, thisBlockStart);
#endif
        // and actually process that triangle on these chunks
        processTriangleChunks(triangle, cache, thisBlockStart, chunkEnd, chunkTriangle, ok);
        ok=1;
#ifdef PAUSE
        sleep();
#endif
        // and advance to the next triangle
        chunkTriangle = triangle->next_triangle;

        // this should only ever happen if we're running really low on cache line slots
        // basically, if we pick up a block with more than NUMBER_OF_TILES_PER_CHUNK and
        // there's no slot to store the pre-NUMBER_OF_TILES_PER_CHUNK tiles.
        // in this case, we process from thisBlockStart only (because we know that from
        // chunkStart to there has no result) and then we only process one triangle
        if (chunkStart != realBlockStart) {
          /*
          printf("[%d] chunkStart=%d != realBlockStart %d, chunkEnd=%d, "
                 "firstTile=%d chunk=%d\n",
                 _SPUID, chunkStart, realBlockStart, chunkEnd, firstTile, chunkToProcess);
          debug_render_tasks(cache);
          */

          // abort the while loop
          break;
        }

        // read the next triangle
        unsigned int extra = chunkTriangle & 127;
        unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
        triangle = (Triangle*) (trianglebuffer+extra);
        unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

        // ensure DMA slot available
        do {} while (!spu_readchcnt(MFC_Cmd));

        spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea),
                     length, 0, MFC_GET_CMD);
        mfc_write_tag_mask(1<<0);
        mfc_read_tag_status_all();
      } // until chunkTriangle == endTriangle

      // flush any output buffers
      flushTileBuffers(thisBlockStart, chunkEnd);
    } // firstTile>=0
/* Majority via select: where x is 1 take y|z, where x is 0 take y&z --
 * equal to Maj(x,y,z) = (x&y) | (x&z) | (y&z). */
static inline vec_uint4 vec_Maj(vec_uint4 x, vec_uint4 y, vec_uint4 z)
{
    return spu_sel(spu_and(y, z), spu_or(y, z), x);
}
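/* Reference form of the SHA-2 majority function, for comparison with the
 * two spu_sel variants in this collection:
 * Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z). */
static inline vec_uint4 vec_Maj_ref(vec_uint4 x, vec_uint4 y, vec_uint4 z)
{
    return spu_xor(spu_xor(spu_and(x, y), spu_and(x, z)), spu_and(y, z));
}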
void merge_buffers()
{
    vector unsigned int cmp_v, cmp_v2;

    const vector signed int one_at_0 = {1,0,0,0};
    const vector signed int one_at_1 = {0,1,0,0};
    const vector signed int one_at_2 = {0,0,1,0};
    const vector signed int ones  = {1,1,1,1};
    const vector signed int zeros = {0,0,0,0};
    const vector unsigned char cmp_v_shuffle_mask = {31,31,31,31, 31,31,31,31,
                                                     31,31,31,31, 31,31,31,31};
    vector unsigned char rev_mask;
    const vector unsigned char rev_left  = {12,13,14,15,  8, 9,10,11,
                                             4, 5, 6, 7,  0, 1, 2, 3};
    const vector unsigned char rev_right = {28,29,30,31, 24,25,26,27,
                                            20,21,22,23, 16,17,18,19};

    vector signed int *out_head_idx;
    if (mcb[am].local[OUT] < 255) {
        int parent_idx = mcb[am].local[OUT];
        int side = (mcb[am].id+1)&1;
        out_head_idx = (vector signed int*) &md[parent_idx].idx[side][HEAD];
    } else {
        out_head_idx = (vector signed int*) &md[am].idx[OUT][HEAD];
    }

    vector signed int *left_tail_idx  = (vector signed int*) &md[am].idx[LEFT][TAIL];
    vector signed int *right_tail_idx = (vector signed int*) &md[am].idx[RIGHT][TAIL];

    vector signed int size_v = {mcb[am].buffer_size[LEFT],
                                mcb[am].buffer_size[RIGHT],
                                mcb[am].buffer_size[OUT],
                                0};
    vector signed int avail_v = {num_in_buffer(LEFT),
                                 num_in_buffer(RIGHT),
                                 num_free_in_buffer(OUT),
                                 1};
    vector signed int avail_before = { spu_extract(avail_v, 0),
                                       spu_extract(avail_v, 1),
                                       0, 0 };

    // avail = 0x0F if all avail_v > zeros
    vector unsigned int avail = spu_gather( spu_cmpgt(avail_v, zeros) );

    vector signed int *left, *right, *out;
    left  = (vector signed int*) &md[am].buffer[LEFT][ spu_extract(*left_tail_idx,0) ];
    right = (vector signed int*) &md[am].buffer[RIGHT][ spu_extract(*right_tail_idx,0) ];
    out   = (vector signed int*) &md[am].buffer[OUT][ spu_extract(*out_head_idx,0) ];

#ifdef TRACE_TIME
    dec_val2 = spu_read_decrementer();
#endif

    while (spu_extract(avail,0) == 0x0F) {
        // cmp left and right to determine who gets eaten
        cmp_v = spu_cmpgt(*left, *right);
        // cmp_v = {FFFF,FFFF,FFFF,FFFF} if left[3] > right[3]
        cmp_v = spu_shuffle(cmp_v, cmp_v, cmp_v_shuffle_mask);

        *out = spu_sel(*left, *right, cmp_v);
        rev_mask = spu_sel(rev_right, rev_left, (vector unsigned char)cmp_v);
        *left = spu_shuffle(*left, *right, rev_mask);

        // data to be sorted is now in out and left, left in descending order
        sort_vectors(out, left);

        // update index of the used side
        if ( spu_extract(cmp_v,0) ) { // left[3] > right[3]
            *right_tail_idx = spu_add(*right_tail_idx, ones);
            avail_v = spu_sub(avail_v, one_at_1);
            right++;

            // modulus hack
            cmp_v2 = spu_cmpeq(*right_tail_idx, size_v);
            if ( __builtin_expect( spu_extract(cmp_v2,0), 0) ) {
                *right_tail_idx = zeros;
                right = (vector signed int*) &md[am].buffer[RIGHT][0];
            }
        } else {
            *right = *left;
            *left_tail_idx = spu_add(*left_tail_idx, ones);
            avail_v = spu_sub(avail_v, one_at_0);
            left++;

            // modulus hack
            cmp_v2 = spu_cmpeq(*left_tail_idx, size_v);
            if ( __builtin_expect( spu_extract(cmp_v2,0), 0) ) {
                *left_tail_idx = zeros;
                left = (vector signed int*) &md[am].buffer[LEFT][0];
            }
        }

        // update out head idx
        *out_head_idx = spu_add(*out_head_idx, ones);
        avail_v = spu_sub(avail_v, one_at_2);
        out++;

        // modulus hack
        cmp_v2 = spu_cmpeq(*out_head_idx, size_v);
        if ( __builtin_expect(spu_extract(cmp_v2,0), 0) ) {
            out = (vector signed int*) &md[am].buffer[OUT][0];
            *out_head_idx = zeros;
        }

        // is there data still available?
        avail = spu_gather(spu_cmpgt(avail_v, zeros));
    }

#ifdef TRACE_TIME
    merge_loop_ticks += -(spu_read_decrementer() - dec_val2);
#endif

    // how much got produced?
    vector signed int consumed = spu_sub(avail_before, avail_v);
    int consumed_left  = spu_extract(consumed, 0);
    int consumed_right = spu_extract(consumed, 1);

    if (consumed_left)  update_tail(LEFT);
    if (consumed_right) update_tail(RIGHT);

    md[am].consumed[LEFT]  += consumed_left;
    md[am].consumed[RIGHT] += consumed_right;

    if (md[am].consumed[LEFT]  == mcb[am].data_size[LEFT])  md[am].depleted[LEFT]  = 1;
    if (md[am].consumed[RIGHT] == mcb[am].data_size[RIGHT]) md[am].depleted[RIGHT] = 1;

    if (mcb[am].local[OUT] < 255 && md[am].depleted[LEFT] && md[am].depleted[RIGHT]) {
        md[am].done = 1;
        --num_active_mergers;
    }
}
inline vector float spu_max(vector float a, vector float b)
{
    return spu_sel( b, a, spu_cmpgt( a, b ) );
}
Triangle* getTriangleBuffer(Context* context)
{
  // if we've already allocated a triangle buffer (and we're in the same context)
  if (context == _currentTriangleContext && _currentTriangle)
    return _currentTriangle;

  // trash the default values
  _currentTriangleContext = context;
  _currentTriangle = NULL;

  // read the current renderable cache line to ensure there is room for the triangle data
  // in the cache line buffer; we do this by comparing against all 16 cache line blocks
  // to make sure that extending the write pointer wouldn't clobber the data
  unsigned long long cache_ea = context->renderableCacheLine;
  if (cache_ea == 0)
    return NULL;

  char cachebuffer[128+127];
  RenderableCacheLine* cache = (RenderableCacheLine*) ( ((unsigned int)cachebuffer+127) & ~127 );

  // printf("GTB: reading to %x from %x:%x\n", cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea));

  spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
  spu_readch(MFC_RdAtomicStat);

  // extendvalid = ( read<=write && test<end ) || ( read>write && test<read )
  // extendvalid = ( read>write && read>test ) || ( read<=write && end>test )
  // simplifies to    extendvalid = selb(end, read, read>write) > test
  // or               extendvalid = selb(end>test, read>test, read>write)
  // rewind = next >= end
  // rewindvalid = read != 0
  // valid  = extendvalid && (!rewind || rewindvalid)
  //        = extendvalid && (!rewind || !rewindinvalid)
  //        = extendvalid && !(rewind && rewindinvalid)
  // invalid = ! (extendvalid && !(rewind && rewindinvalid))
  //         = (!extendvalid || (rewind && rewindinvalid))

  vec_ushort8 v_writeptr = spu_splats( cache->endTriangle );
  vec_ushort8 v_readptr0 = cache->chunkTriangle[0];
  vec_ushort8 v_readptr1 = cache->chunkTriangle[1];
  vec_ushort8 v_testptr  = spu_add(v_writeptr, TRIANGLE_MAX_SIZE);
  vec_ushort8 v_nextptr  = spu_add(v_writeptr, 2*TRIANGLE_MAX_SIZE);
  vec_ushort8 v_endptr   = spu_splats( (unsigned short)TRIANGLE_BUFFER_SIZE);

  vec_ushort8 v_zero   = spu_splats( (unsigned short) 0 );
  vec_uchar16 v_merger = (vec_uchar16) { 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 };

  vec_ushort8 v_max0_test = spu_sel( v_endptr, v_readptr0, spu_cmpgt( v_readptr0, v_writeptr ) );
  vec_ushort8 v_max1_test = spu_sel( v_endptr, v_readptr1, spu_cmpgt( v_readptr1, v_writeptr ) );
  vec_ushort8 v_extend0_valid = spu_cmpgt( v_max0_test, v_testptr );
  vec_ushort8 v_extend1_valid = spu_cmpgt( v_max1_test, v_testptr );
  vec_ushort8 v_rewind0_invalid = spu_cmpeq( v_readptr0, v_zero );
  vec_ushort8 v_rewind1_invalid = spu_cmpeq( v_readptr1, v_zero );
  vec_ushort8 v_rewind8 = spu_cmpgt( v_nextptr, v_endptr );

  vec_uchar16 v_extend_valid   = (vec_uchar16) spu_shuffle( v_extend0_valid, v_extend1_valid, v_merger );
  vec_uchar16 v_rewind_invalid = (vec_uchar16) spu_shuffle( v_rewind0_invalid, v_rewind1_invalid, v_merger );
  vec_uchar16 v_rewind         = (vec_uchar16) v_rewind8;

  vec_uchar16 v_valid_rhs = spu_and( v_rewind_invalid, v_rewind );
  vec_uchar16 v_invalid   = spu_orc( v_valid_rhs, v_extend_valid );

  // check to see if the chunk is being processed
  vec_uint4 v_free = spu_gather(
      spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) );
  vec_uint4 v_invalid_bits = spu_andc( spu_gather( v_invalid ), (vec_uint4) v_free );

  // if any of the bits are invalid, then no can do
  if ( spu_extract(v_invalid_bits, 0) ) {
    return NULL;
  }

  // fetch in the data before this triangle in the cache buffer
  unsigned int offset = cache->endTriangle;
  _currentTriangleBufferExtra = offset & 127;
  unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (offset & ~127);
  if (_currentTriangleBufferExtra) {
    spu_mfcdma64(_currentTriangleBuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea),
                 128, 0, MFC_GET_CMD);

    // ensure DMA did actually complete
    mfc_write_tag_mask(1<<0);
    mfc_read_tag_status_all();
  }

  // final bit of initialisation
  _currentTriangle = (Triangle*) (_currentTriangleBuffer+_currentTriangleBufferExtra);
  _currentTriangleOffset = offset;
  _currentTriangleRewind = v_rewind8;
  _currentTriangleCacheEndTriangleEAL = mfc_ea2l(cache_ea) + (((char*)&cache->endTriangle) - ((char*)cache));
  _currentTriangleCacheEndTriangleEAH = mfc_ea2h(cache_ea);
  _currentTriangleBufferEA = trianglebuffer_ea;

  // printf("Allocated new triangle buffer: %x\n", offset);

  // and return the buffer ready to go
  return _currentTriangle;
}
inline vector float spu_min(vector float a, vector float b)
{
    return spu_sel( a, b, spu_cmpgt( a, b ) );
}
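/* The two helpers compose into a branchless clamp (sketch): */
inline vector float spu_clamp(vector float v, vector float lo, vector float hi)
{
    return spu_min(spu_max(v, lo), hi);
}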