vec_ullong2 cmpnegzerod2( vec_double2 x )
{
  vec_ullong2 cmp;
  vec_uchar16 even = (vec_uchar16)(vec_uint4){ 0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b };
  vec_uchar16 odd  = (vec_uchar16)(vec_uint4){ 0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f };

  /* A doubleword equals -0.0 only if its high word is 0x80000000 and its low word is
   * 0x00000000; compare word-wise, then AND the even/odd word results together so the
   * final mask is all ones only where both words of the doubleword matched.
   */
  cmp = (vec_ullong2)spu_cmpeq( (vec_int4)x, (vec_int4)spu_splats(0x8000000000000000ull) );
  cmp = spu_and( spu_shuffle( cmp, cmp, even ), spu_shuffle( cmp, cmp, odd ) );
  return cmp;
}
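/*
 * A minimal usage sketch, not from the original source: it tests whether either
 * doubleword of a vec_double2 is negative zero using cmpnegzerod2() above.
 * The helper name is illustrative only.
 */
#include <spu_intrinsics.h>

static int any_negative_zero(vec_double2 v)
{
  /* the returned mask is all ones in every doubleword slot that held -0.0 */
  vec_ullong2 mask = cmpnegzerod2(v);
  return (spu_extract(mask, 0) | spu_extract(mask, 1)) != 0;
}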
inline void merge_cache_blocks(RenderableCacheLine* cache) { vec_uchar16 next = cache->chunkNext; for (;;) { vec_uchar16 nextnext = spu_shuffle(next, next, next); vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK)); vec_ushort8 firstblock0 = spu_cmpeq( cache->chunkStart[0], 0); vec_ushort8 firstblock1 = spu_cmpeq( cache->chunkStart[1], 0); // change next to word offset, note we don't care what the low bit shifted in is vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 ); vec_uchar16 first = (vec_uchar16) spu_shuffle( firstblock0, firstblock1, firstshuf ); vec_ushort8 tri0 = cache->chunkTriangle[0]; vec_ushort8 tri1 = cache->chunkTriangle[1]; vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1)); vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254)); vec_ushort8 ntri0 = spu_shuffle( tri0, tri1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) ); vec_ushort8 ntri1 = spu_shuffle( tri0, tri1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) ); vec_ushort8 trieq0 = spu_cmpeq( tri0, ntri0 ); vec_ushort8 trieq1 = spu_cmpeq( tri1, ntri1 ); vec_uchar16 trieq = (vec_uchar16) spu_shuffle( trieq0, trieq1, MERGE ); vec_uchar16 combi = spu_orc(first, trieq); vec_uchar16 canmerge = spu_cmpgt( spu_nor(spu_or(next, nextnext), combi), 256-CHUNKNEXT_BUSY_BIT ); vec_uint4 gather = spu_gather( canmerge ); vec_uint4 mergeid = spu_sub( spu_cntlz( gather ), spu_promote((unsigned int)16, 0)); if( !spu_extract(gather, 0) ) { return; } // unsigned int firstchunk = spu_extract(mergeid, 0); // unsigned int nextchunk = cache->chunkNextArray[firstchunk]; vec_uint4 v_chunkNext = (vec_uint4) si_rotqby( (qword) next, (qword) spu_add(mergeid,13) ); vec_uint4 v_chunkNextNext = (vec_uint4) si_rotqby( (qword) next, (qword) spu_add(v_chunkNext,13) ); // cache->chunkNextArray[firstchunk] = cache->chunkNextArray[nextchunk]; next = spu_shuffle( (vec_uchar16) v_chunkNextNext, next, (vec_uchar16) si_cbd( (qword) mergeid, 0 ) ); // cache->chunkNextArray[nextchunk] = CHUNKNEXT_FREE_BLOCK; next = spu_shuffle( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK), next, (vec_uchar16) si_cbd( (qword) v_chunkNext, 0 ) ); // this is for debug use only, it's not really needed... // cache->chunkStartArray[nextchunk] = -1; cache->chunkStartArray[ spu_extract(v_chunkNext,0) & 255 ] = -1; cache->chunkNext = next; } }
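/*
 * A hedged illustration of the si_cbd()/spu_shuffle() idiom used above to poke a
 * single byte into a vector register: as far as I understand it, si_cbd() builds a
 * shuffle pattern that copies every byte of the second shuffle operand except the
 * one at the requested index, which is taken from the first operand.  The helper
 * name and parameters below are local to this sketch.
 */
#include <spu_intrinsics.h>

static vec_uchar16 insert_byte(vec_uchar16 target, unsigned char value,
                               unsigned int index /* 0..15 */)
{
  /* the insertion position comes from the preferred (element 0) slot of the qword argument */
  qword pattern = si_cbd(si_from_uint(index), 0);
  /* splatting the value means it is picked up regardless of which source byte cbd selects */
  return spu_shuffle(spu_splats(value), target, (vec_uchar16)pattern);
}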
int kernel(lwp_functions* pf, void* params, void* inout, unsigned int iter, unsigned int n) { Ternary_params* p = (Ternary_params*)params; switch (p->cmd) { case AM: { int length = p->length / 4; vector float *a = (vector float *)inout; vector float *b = a + length; vector float *c = a + 2 * length; unsigned int i; for (i = 0; i != length; ++i, ++a, ++b, ++c) *a = spu_mul(spu_add(*a, *b), *c); return 0; } case MA: { int length = p->length / 4; vector float *a = (vector float *)inout; vector float *b = a + length; vector float *c = a + 2 * length; unsigned int i; for (i = 0; i != length; ++i, ++a, ++b, ++c) *a = spu_madd(*a, *b, *c); return 0; } case CAM: { static vector unsigned char lo = (vector unsigned char) { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}; static vector unsigned char hi = (vector unsigned char) { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31}; int length = p->length / 4; float *a = (float *)inout; float *b = a + 8 * length; float *c = a + 16 * length; unsigned int i; // (a + b) * c: // r.r = (a.r+b.r)*c.r - (a.i+b.i)*c.i // r.i = (a.r+b.r)*c.i + (a.i+b.i)*c.r for (i = 0; i != length; ++i, a+=8, b+=8, c+=8) { vector float av = {*a, *(a+2), *(a+4), *(a+6)}; // a.r vector float bv = {*b, *(b+2), *(b+4), *(b+6)}; // b.r vector float cv = {*c, *(c+2), *(c+4), *(c+6)}; // c.r vector float dv = {*(a+1), *(a+3), *(a+5), *(a+7)}; // a.i vector float ev = {*(b+1), *(b+3), *(b+5), *(b+7)}; // b.i vector float fv = {*(c+1), *(c+3), *(c+5), *(c+7)}; // c.i vector float trv = spu_add(av, bv); // a.r+b.r vector float tiv = spu_add(dv, ev); // a.i+b.i vector float sv = spu_mul(trv, cv); // (a.r+b.r)*c.r vector float tv = spu_mul(trv, fv); // (a.r+b.r)*c.i vector float real = spu_nmsub(tiv, fv, sv); // r.r vector float imag = spu_madd(tiv, cv, tv); // r.i // interleave result *(vector float *)a = spu_shuffle(real, imag, lo); *(vector float *)(a+4) = spu_shuffle(real, imag, hi); } return 0; } case CMA: { static vector unsigned char lo = (vector unsigned char) { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}; static vector unsigned char hi = (vector unsigned char) { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31}; int length = p->length / 4; float *a = (float *)inout; float *b = a + 8 * length; float *c = a + 16 * length; unsigned int i; // a * b + c: // r.r = a.r*b.r + c.r - a.i*b.i // r.i = a.r*b.i + c.i + a.i*b.r for (i = 0; i != length; ++i, a+=8, b+=8, c+=8) { vector float av = {*a, *(a+2), *(a+4), *(a+6)}; // a.r vector float bv = {*b, *(b+2), *(b+4), *(b+6)}; // b.r vector float cv = {*c, *(c+2), *(c+4), *(c+6)}; // c.r vector float dv = {*(a+1), *(a+3), *(a+5), *(a+7)}; // a.i vector float ev = {*(b+1), *(b+3), *(b+5), *(b+7)}; // b.i vector float fv = {*(c+1), *(c+3), *(c+5), *(c+7)}; // c.i vector float real = spu_nmsub(dv, ev, spu_madd(av, bv, cv)); // r.r vector float imag = spu_madd(dv, bv, spu_madd(av, ev, fv)); // r.i // interleave result *(vector float *)a = spu_shuffle(real, imag, lo); *(vector float *)(a+4) = spu_shuffle(real, imag, hi); } return 0; } case ZAM: { int length = p->length / 4; float *a_re = (float *)inout; float *a_im = a_re + 4 * length; float *b_re = a_re + 8 * length; float *b_im = a_re + 12 * length; float *c_re = a_re + 16 * length; float *c_im = a_re + 20 * length; unsigned int i; // (a + b) * c: // r.r = (a.r+b.r)*c.r - (a.i+b.i)*c.i // r.i = (a.r+b.r)*c.i + (a.i+b.i)*c.r for (i = 0; i != length; ++i, a_re+=4, b_re+=4, c_re+=4, a_im+=4, b_im+=4, c_im+=4) { vector float *av = (vector float *)a_re; vector 
float *bv = (vector float *)b_re; vector float *cv = (vector float *)c_re; vector float *dv = (vector float *)a_im; vector float *ev = (vector float *)b_im; vector float *fv = (vector float *)c_im; vector float trv = spu_add(*av, *bv); // a.r+b.r vector float tiv = spu_add(*dv, *ev); // a.i+b.i vector float sv = spu_mul(trv, *cv); // (a.r+b.r)*c.r vector float tv = spu_mul(trv, *fv); // (a.r+b.r)*c.i *av = spu_nmsub(tiv, *fv, sv); // r.r *dv = spu_madd(tiv, *cv, tv); // r.i } return 0; } case ZMA: { int length = p->length / 4; float *a_re = (float *)inout; float *a_im = a_re + 4 * length; float *b_re = a_re + 8 * length; float *b_im = a_re + 12 * length; float *c_re = a_re + 16 * length; float *c_im = a_re + 20 * length; unsigned int i; // a * b + c: // r.r = a.r*b.r + c.r - a.i*b.i // r.i = a.r*b.i + c.i + a.i*b.r for (i = 0; i != length; ++i, a_re+=4, b_re+=4, c_re+=4, a_im+=4, b_im+=4, c_im+=4) { vector float *av = (vector float *)a_re; vector float *bv = (vector float *)b_re; vector float *cv = (vector float *)c_re; vector float *dv = (vector float *)a_im; vector float *ev = (vector float *)b_im; vector float *fv = (vector float *)c_im; vector float tmp = spu_nmsub(*dv, *ev, spu_madd(*av, *bv, *cv)); *dv = spu_madd(*dv, *bv, spu_madd(*av, *ev, *fv)); *av = tmp; } return 0; } } return 1; }
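/*
 * A hedged caller-side sketch of the data layout the AM case above expects: the
 * inout area holds three back-to-back operands of p->length floats each, and the
 * kernel overwrites the first operand with (a + b) * c.  The buffer size, helper
 * name and parameters here are illustrative, not part of the original source.
 */
#include <string.h>

static float inout_area[3 * 1024] __attribute__ ((aligned(16)));

static void pack_am_operands(const float* a, const float* b, const float* c,
                             unsigned int length /* multiple of 4, at most 1024 */)
{
  memcpy(inout_area,              a, length * sizeof(float));
  memcpy(inout_area + length,     b, length * sizeof(float));
  memcpy(inout_area + 2 * length, c, length * sizeof(float));
}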
/** * Setup fragment shader inputs by evaluating triangle's vertex * attribute coefficient info. * \param x quad x pos * \param y quad y pos * \param fragZ returns quad Z values * \param fragInputs returns fragment program inputs * Note: this code could be incorporated into the fragment program * itself to avoid the loop and switch. */ static void eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[]) { static const vector float deltaX = (const vector float) {0, 1, 0, 1}; static const vector float deltaY = (const vector float) {0, 0, 1, 1}; const uint posSlot = 0; const vector float pos = setup.coef[posSlot].a0; const vector float dposdx = setup.coef[posSlot].dadx; const vector float dposdy = setup.coef[posSlot].dady; const vector float fragX = spu_splats(x) + deltaX; const vector float fragY = spu_splats(y) + deltaY; vector float fragW, wInv; uint i; *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy); fragW = splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy); wInv = spu_re(fragW); /* 1 / w */ /* loop over fragment program inputs */ for (i = 0; i < spu.vertex_info.num_attribs; i++) { uint attr = i + 1; enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode; /* constant term */ vector float a0 = setup.coef[attr].a0; vector float r0 = splatx(a0); vector float r1 = splaty(a0); vector float r2 = splatz(a0); vector float r3 = splatw(a0); if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) { /* linear term */ vector float dadx = setup.coef[attr].dadx; vector float dady = setup.coef[attr].dady; /* Use SPU intrinsics here to get slightly better code. * originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady); */ r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0)); r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1)); r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2)); r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3)); if (interp == INTERP_PERSPECTIVE) { /* perspective term */ r0 *= wInv; r1 *= wInv; r2 *= wInv; r3 *= wInv; } } fragInputs[CHAN0] = r0; fragInputs[CHAN1] = r1; fragInputs[CHAN2] = r2; fragInputs[CHAN3] = r3; fragInputs += 4; } } /** * Emit a quad (pass to next stage). No clipping is done. * Note: about 1/5 to 1/7 of the time, mask is zero and this function * should be skipped. But adding the test for that slows things down * overall. */ static INLINE void emit_quad( int x, int y, mask_t mask) { /* If any bits in mask are set... */ if (spu_extract(spu_orx(mask), 0)) { const int ix = x - setup.cliprect_minx; const int iy = y - setup.cliprect_miny; spu.cur_ctile_status = TILE_STATUS_DIRTY; spu.cur_ztile_status = TILE_STATUS_DIRTY; { /* * Run fragment shader, execute per-fragment ops, update fb/tile. */ vector float inputs[4*4], outputs[2*4]; vector unsigned int kill_mask; vector float fragZ; eval_inputs((float) x, (float) y, &fragZ, inputs); ASSERT(spu.fragment_program); ASSERT(spu.fragment_ops); /* Execute the current fragment program */ kill_mask = spu.fragment_program(inputs, outputs, spu.constants); mask = spu_andc(mask, kill_mask); /* Execute per-fragment/quad operations, including: * alpha test, z test, stencil test, blend and framebuffer writing. * Note that there are two different fragment operations functions * that can be called, one for front-facing fragments, and one * for back-facing fragments. (Often the two are the same; * but in some cases, like two-sided stenciling, they can be * very different.) 
So choose the correct function depending * on the calculated facing. */ spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile, fragZ, outputs[0*4+0], outputs[0*4+1], outputs[0*4+2], outputs[0*4+3], mask); } } } /** * Given an X or Y coordinate, return the block/quad coordinate that it * belongs to. */ static INLINE int block(int x) { return x & ~1; } /** * Render a horizontal span of quads */ static void flush_spans(void) { int minleft, maxright; const int l0 = spu_extract(setup.span.quad, 0); const int l1 = spu_extract(setup.span.quad, 1); const int r0 = spu_extract(setup.span.quad, 2); const int r1 = spu_extract(setup.span.quad, 3); switch (setup.span.y_flags) { case 0x3: /* both odd and even lines written (both quad rows) */ minleft = MIN2(l0, l1); maxright = MAX2(r0, r1); break; case 0x1: /* only even line written (quad top row) */ minleft = l0; maxright = r0; break; case 0x2: /* only odd line written (quad bottom row) */ minleft = l1; maxright = r1; break; default: return; } /* OK, we're very likely to need the tile data now. * clear or finish waiting if needed. */ if (spu.cur_ctile_status == TILE_STATUS_GETTING) { /* wait for mfc_get() to complete */ //printf("SPU: %u: waiting for ctile\n", spu.init.id); wait_on_mask(1 << TAG_READ_TILE_COLOR); spu.cur_ctile_status = TILE_STATUS_CLEAN; } else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) { //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty); clear_c_tile(&spu.ctile); spu.cur_ctile_status = TILE_STATUS_DIRTY; } ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED); if (spu.read_depth_stencil) { if (spu.cur_ztile_status == TILE_STATUS_GETTING) { /* wait for mfc_get() to complete */ //printf("SPU: %u: waiting for ztile\n", spu.init.id); wait_on_mask(1 << TAG_READ_TILE_Z); spu.cur_ztile_status = TILE_STATUS_CLEAN; } else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) { //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty); clear_z_tile(&spu.ztile); spu.cur_ztile_status = TILE_STATUS_DIRTY; } ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED); } /* XXX this loop could be moved into the above switch cases... */ /* Setup for mask calculation */ const vec_int4 quad_LlRr = setup.span.quad; const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8); const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B)); const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B)); const vec_int4 twos = spu_splats(2); const int x = block(minleft); vec_int4 xs = {x, x+1, x, x+1}; for (; spu_extract(xs, 0) <= block(maxright); xs += twos) { /** * Computes mask to indicate which pixels in the 2x2 quad are actually * inside the triangle's bounds. 
*/ /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */ const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs); const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs); /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */ const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs); /* Combine results to create mask */ const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs); emit_quad(spu_extract(xs, 0), setup.span.y, mask); } setup.span.y = 0; setup.span.y_flags = 0; /* Zero right elements */ setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0)); } #if DEBUG_VERTS static void print_vertex(const struct vertex_header *v) { uint i; fprintf(stderr, " Vertex: (%p)\n", v); for (i = 0; i < spu.vertex_info.num_attribs; i++) { fprintf(stderr, " %d: %f %f %f %f\n", i, spu_extract(v->data[i], 0), spu_extract(v->data[i], 1), spu_extract(v->data[i], 2), spu_extract(v->data[i], 3)); } }
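/*
 * splatx()/splaty()/splatz()/splatw() are used by eval_inputs() above but are not
 * defined in this excerpt.  A plausible minimal reconstruction, assuming each one
 * simply broadcasts one 4-byte component of a vector float to all four lanes:
 */
static INLINE vector float splatx(vector float v)
{
   return spu_shuffle(v, v, (vec_uchar16){ 0,1,2,3,  0,1,2,3,  0,1,2,3,  0,1,2,3 });
}
static INLINE vector float splaty(vector float v)
{
   return spu_shuffle(v, v, (vec_uchar16){ 4,5,6,7,  4,5,6,7,  4,5,6,7,  4,5,6,7 });
}
static INLINE vector float splatz(vector float v)
{
   return spu_shuffle(v, v, (vec_uchar16){ 8,9,10,11,  8,9,10,11,  8,9,10,11,  8,9,10,11 });
}
static INLINE vector float splatw(vector float v)
{
   return spu_shuffle(v, v, (vec_uchar16){ 12,13,14,15,  12,13,14,15,  12,13,14,15,  12,13,14,15 });
}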
vector double __divv2df3 (vector double a_in, vector double b_in) { /* Variables */ vec_int4 exp, exp_bias; vec_uint4 no_underflow, overflow; vec_float4 mant_bf, inv_bf; vec_ullong2 exp_a, exp_b; vec_ullong2 a_nan, a_zero, a_inf, a_denorm, a_denorm0; vec_ullong2 b_nan, b_zero, b_inf, b_denorm, b_denorm0; vec_ullong2 nan; vec_uint4 a_exp, b_exp; vec_ullong2 a_mant_0, b_mant_0; vec_ullong2 a_exp_1s, b_exp_1s; vec_ullong2 sign_exp_mask; vec_double2 a, b; vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult; /* Constants */ vec_uint4 exp_mask_u32 = spu_splats((unsigned int)0x7FF00000); vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8, 9,10,11, 8,9,10,11 }; vec_uchar16 swap_32 = (vec_uchar16) { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; vec_ullong2 exp_mask = spu_splats(0x7FF0000000000000ULL); vec_ullong2 sign_mask = spu_splats(0x8000000000000000ULL); vec_float4 onef = spu_splats(1.0f); vec_double2 one = spu_splats(1.0); vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL); sign_exp_mask = spu_or(sign_mask, exp_mask); /* Extract the floating point components from each of the operands including * exponent and mantissa. */ a_exp = (vec_uint4)spu_and((vec_uint4)a_in, exp_mask_u32); a_exp = spu_shuffle(a_exp, a_exp, splat_hi); b_exp = (vec_uint4)spu_and((vec_uint4)b_in, exp_mask_u32); b_exp = spu_shuffle(b_exp, b_exp, splat_hi); a_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_mask), 0); a_mant_0 = spu_and(a_mant_0, spu_shuffle(a_mant_0, a_mant_0, swap_32)); b_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_mask), 0); b_mant_0 = spu_and(b_mant_0, spu_shuffle(b_mant_0, b_mant_0, swap_32)); a_exp_1s = (vec_ullong2)spu_cmpeq(a_exp, exp_mask_u32); b_exp_1s = (vec_ullong2)spu_cmpeq(b_exp, exp_mask_u32); /* Identify all possible special values that must be accommodated including: * +-denorm, +-0, +-infinity, and NaNs. */ a_denorm0= (vec_ullong2)spu_cmpeq(a_exp, 0); a_nan = spu_andc(a_exp_1s, a_mant_0); a_zero = spu_and (a_denorm0, a_mant_0); a_inf = spu_and (a_exp_1s, a_mant_0); a_denorm = spu_andc(a_denorm0, a_zero); b_denorm0= (vec_ullong2)spu_cmpeq(b_exp, 0); b_nan = spu_andc(b_exp_1s, b_mant_0); b_zero = spu_and (b_denorm0, b_mant_0); b_inf = spu_and (b_exp_1s, b_mant_0); b_denorm = spu_andc(b_denorm0, b_zero); /* Scale denorm inputs to into normalized numbers by conditionally scaling the * input parameters. */ a = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_mask)); a = spu_sel(a_in, a, a_denorm); b = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_mask)); b = spu_sel(b_in, b, b_denorm); /* Extract the divisor and dividend exponent and force parameters into the signed * range [1.0,2.0) or [-1.0,2.0). */ exp_a = spu_and((vec_ullong2)a, exp_mask); exp_b = spu_and((vec_ullong2)b, exp_mask); mant_a = spu_sel(a, one, (vec_ullong2)exp_mask); mant_b = spu_sel(b, one, (vec_ullong2)exp_mask); /* Approximate the single reciprocal of b by using * the single precision reciprocal estimate followed by one * single precision iteration of Newton-Raphson. */ mant_bf = spu_roundtf(mant_b); inv_bf = spu_re(mant_bf); inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf); /* Perform 2 more Newton-Raphson iterations in double precision. The * result (q1) is in the range (0.5, 2.0). 
*/ inv_b = spu_extend(inv_bf); inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b); q0 = spu_mul(mant_a, inv_b); q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0); /* Determine the exponent correction factor that must be applied * to q1 by taking into account the exponent of the normalized inputs * and the scale factors that were applied to normalize them. */ exp = spu_rlmaska(spu_sub((vec_int4)exp_a, (vec_int4)exp_b), -20); exp = spu_add(exp, (vec_int4)spu_add(spu_and((vec_int4)a_denorm, -0x34), spu_and((vec_int4)b_denorm, 0x34))); /* Bias the quotient exponent depending on the sign of the exponent correction * factor so that a single multiplier will ensure the entire double precision * domain (including denorms) can be achieved. * * exp bias q1 adjust exp * ===== ======== ========== * positive 2^+65 -65 * negative 2^-64 +64 */ exp_bias = spu_xor(spu_rlmaska(exp, -31), 64); exp = spu_sub(exp, exp_bias); q1 = spu_sel(q1, (vec_double2)spu_add((vec_int4)q1, spu_sl(exp_bias, 20)), exp_mask); /* Compute a multiplier (mult) to applied to the quotient (q1) to produce the * expected result. On overflow, clamp the multiplier to the maximum non-infinite * number in case the rounding mode is not round-to-nearest. */ exp = spu_add(exp, 0x3FF); no_underflow = spu_cmpgt(exp, 0); overflow = spu_cmpgt(exp, 0x7FE); exp = spu_and(spu_sl(exp, 20), (vec_int4)no_underflow); exp = spu_and(exp, (vec_int4)exp_mask); mult = spu_sel((vec_double2)exp, (vec_double2)(spu_add((vec_uint4)exp_mask, -1)), (vec_ullong2)overflow); /* Handle special value conditions. These include: * * 1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN a NaN * results. * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN a INFINITY results. * 3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0 results. */ mult = spu_andc(mult, (vec_double2)spu_or(a_zero, b_inf)); mult = spu_sel(mult, (vec_double2)exp_mask, spu_or(a_inf, b_zero)); nan = spu_or(a_nan, b_nan); nan = spu_or(nan, spu_and(a_zero, b_zero)); nan = spu_or(nan, spu_and(a_inf, b_inf)); mult = spu_or(mult, (vec_double2)nan); /* Scale the final quotient */ q2 = spu_mul(q1, mult); return (q2); }
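/*
 * A small hedged test harness, not part of the original routine: it compares the
 * two lanes produced by __divv2df3() against ordinary scalar double division.
 * The helper name and test values are illustrative only.
 */
#include <stdio.h>
#include <spu_intrinsics.h>

static void check_divv2df3(double a0, double a1, double b0, double b1)
{
  vector double a = (vector double){ a0, a1 };
  vector double b = (vector double){ b0, b1 };
  vector double q = __divv2df3(a, b);

  printf("%g / %g = %g (scalar %g)\n", a0, b0, spu_extract(q, 0), a0 / b0);
  printf("%g / %g = %g (scalar %g)\n", a1, b1, spu_extract(q, 1), a1 / b1);
}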
void process_render_tasks(unsigned long eah_render_tasks, unsigned long eal_render_tasks) { const vec_uchar16 SHUFFLE_MERGE_BYTES = (vec_uchar16) { // merge lo bytes from unsigned shorts (array) 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 }; const vec_uchar16 SHUFFLE_GET_BUSY_WITH_ONES = (vec_uchar16) { // get busy flag with ones in unused bytes 0xc0, 0xc0, 2, 3, 0xc0,0xc0,0xc0,0xc0, 0xc0,0xc0,0xc0,0xc0 }; const vec_uchar16 ZERO_BYTES = (vec_uchar16) spu_splats(0); char trianglebuffer[ 256 + TRIANGLE_MAX_SIZE ]; char sync_buffer[128+127]; void* aligned_sync_buffer = (void*) ( ((unsigned long)sync_buffer+127) & ~127 ); RenderableCacheLine* cache = (RenderableCacheLine*) aligned_sync_buffer; unsigned long long cache_ea; spu_mfcdma64(&cache_ea, eah_render_tasks, eal_render_tasks, sizeof(cache_ea), 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); while (cache_ea) { // terminate immediately if possible if (spu_stat_in_mbox()) return; // read the cache line spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD); spu_readch(MFC_RdAtomicStat); unsigned int endTriangle = cache->endTriangle; vec_ushort8 testTriangle = spu_splats((unsigned short) endTriangle); // first look for short chunks vec_uchar16 next = cache->chunkNext; vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK)); // change next to word offset, note we don't care what the low bit shifted in is vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 ); vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1)); vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254)); vec_ushort8 start0 = cache->chunkStart[0]; vec_ushort8 start1 = cache->chunkStart[1]; vec_ushort8 nstart0 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) ); vec_ushort8 nstart1 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) ); vec_ushort8 starteq0 = spu_cmpeq( nstart0, spu_splats((unsigned short)0) ); vec_ushort8 starteq1 = spu_cmpeq( nstart1, spu_splats((unsigned short)0) ); vec_ushort8 end0 = spu_sel( nstart0, spu_splats((unsigned short)4096), starteq0); vec_ushort8 end1 = spu_sel( nstart1, spu_splats((unsigned short)4096), starteq1); vec_ushort8 len0 = spu_sub( end0, start0); vec_ushort8 len1 = spu_sub( end1, start1); vec_ushort8 small0 = spu_cmpgt( spu_splats((unsigned short)17), len0); vec_ushort8 small1 = spu_cmpgt( spu_splats((unsigned short)17), len1); vec_uchar16 small = (vec_uchar16) spu_shuffle( small0, small1, MERGE ); vec_uint4 smallChunkGather = spu_gather(small); // check to see if chunk is already at the last triangle vec_uint4 doneChunkGather = spu_gather( (vec_uchar16) spu_shuffle( (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[0]), (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[1]), SHUFFLE_MERGE_BYTES) ); // check if the chunk is free vec_uint4 freeChunkGather = spu_gather( spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) ); // check to see if the chunk is being processed vec_uint4 busyChunkGather = spu_gather( spu_cmpgt( cache->chunkNext, //spu_and(cache->chunkNext, CHUNKNEXT_MASK), spu_splats( (unsigned char) (CHUNKNEXT_BUSY_BIT-1) ) ) ); // doneChunkGather, freeChunkGather, busyChunkGather - rightmost 16 bits of word 0 // note that if freeChunkGather is true then busyChunkGather must also be true // done=false, free=false, busy=false -> can process // free=false, busy=false -> can be merged // decide which chunk to 
process vec_uint4 mayProcessGather = spu_nor( doneChunkGather, busyChunkGather ); vec_uint4 mayProcessShortGather = spu_and( mayProcessGather, smallChunkGather ); vec_uint4 shortSelMask = spu_cmpeq( mayProcessShortGather, spu_splats(0U) ); vec_uint4 mayProcessSelection = spu_sel( mayProcessShortGather, mayProcessGather, shortSelMask ); /* if (!spu_extract(shortSelMask, 0)) printf("taken short: may=%04x short=%04x mayshort=%04x mask=%04x sel=%04x\n", spu_extract(mayProcessGather, 0) & 0xffff, spu_extract(smallChunkGather, 0), spu_extract(mayProcessShortGather, 0), spu_extract(shortSelMask, 0) & 0xffff, spu_extract(mayProcessSelection, 0) & 0xffff ); */ vec_uint4 mayProcessBits = spu_sl( mayProcessSelection, 16); unsigned int chunkToProcess = spu_extract( spu_cntlz( mayProcessBits ), 0); unsigned int freeChunk = spu_extract( spu_cntlz( spu_sl( freeChunkGather, 16 ) ), 0); // if there's nothing to process, try the next cache line in the rendering tasks list if (!spu_extract(mayProcessBits, 0)) { trynextcacheline: cache_ea = cache->next; // sleep(); continue; } unsigned int chunkStart = cache->chunkStartArray [chunkToProcess]; unsigned int chunkTriangle = cache->chunkTriangleArray[chunkToProcess]; unsigned int chunkNext = cache->chunkNextArray [chunkToProcess] & CHUNKNEXT_MASK; unsigned int chunkEnd = (cache->chunkStartArray [chunkNext]-1) & (NUMBER_OF_TILES-1); unsigned int chunkLength = 1 + chunkEnd-chunkStart; // only need an extra block if the block is especially long if (chunkLength <= NUMBER_OF_TILES_PER_CHUNK) { freeChunk = 32; } // mark this block as busy cache->chunkNextArray[chunkToProcess] |= CHUNKNEXT_BUSY_BIT; // if there's at least one free chunk, claim it if (freeChunk != 32) { cache->chunkNextArray[freeChunk] = CHUNKNEXT_RESERVED; cache->chunkTriangleArray[freeChunk] = chunkTriangle; } // write the cache line back spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD); if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS) continue; #ifdef INFO printf("[%d] Claimed chunk %d (%d-%d len %d) at tri %x end %x with free chunk %d\n", _SPUID, chunkToProcess, chunkStart, chunkEnd, chunkLength, chunkTriangle, endTriangle, freeChunk!=32 ? freeChunk : -1 ); // debug_render_tasks(cache); #endif Triangle* triangle; int firstTile; do { // read the triangle data for the current triangle unsigned int extra = chunkTriangle & 127; unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127); triangle = (Triangle*) (trianglebuffer+extra); unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127; // ensure DMA slot available do {} while (!spu_readchcnt(MFC_Cmd)); spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); // get the triangle deltas firstTile = findFirstTriangleTile(triangle, chunkStart, chunkEnd); if (firstTile>=0) break; // no match, try next triangle chunkTriangle = triangle->next_triangle; } while (chunkTriangle != endTriangle); // if we actually have something to process... if (firstTile>=0) { // the "normal" splitting will now become: // chunkStart .. (firstTile-1) -> triangle->next_triangle // firstTile .. (firstTile+NUM-1) -> chunkTriangle (BUSY) // (firstTile+NUM) .. 
chunkEnd -> chunkTriangle (FREE) int tailChunk; int thisChunk; int nextBlockStart; int thisBlockStart; int realBlockStart; do { retry: // read the cache line spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD); spu_readch(MFC_RdAtomicStat); // calculate start of next block nextBlockStart = firstTile + NUMBER_OF_TILES_PER_CHUNK; if (nextBlockStart > chunkEnd) nextBlockStart = chunkEnd+1; // calculate start of block to mark as busy thisBlockStart = nextBlockStart - NUMBER_OF_TILES_PER_CHUNK; if (thisBlockStart < chunkStart) thisBlockStart = chunkStart; realBlockStart = thisBlockStart; #ifdef INFO printf("[%d] nextBlockStart=%d, realBlockStart=%d, thisBlockStart=%d, chunkStart=%d\n", _SPUID, nextBlockStart, realBlockStart, thisBlockStart, chunkStart); #endif // allocate some more free chunks vec_uint4 freeChunkGather2 = spu_sl(spu_gather(spu_cmpeq( spu_splats((unsigned char)CHUNKNEXT_FREE_BLOCK), cache->chunkNext)), 16); unsigned int freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0); if (freeChunk == 32) { // if we didn't have one before, try again freeChunk = freeChunk2; // and try to get the second one freeChunkGather2 = spu_andc( freeChunkGather2, spu_promote(0x80000000>>freeChunk2, 0) ); freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0); } else { // speculatively clear the free chunk just in case we don't need it cache->chunkNextArray[freeChunk] = CHUNKNEXT_FREE_BLOCK; } #ifdef INFO printf("[%d] Free chunks %d and %d, cN=%d, nBS=%d, cE=%d, tBS=%d, cS=%d\n", _SPUID, freeChunk, freeChunk2, chunkNext, nextBlockStart, chunkEnd, thisBlockStart, chunkStart ); #endif // mark region after as available for processing if required if (nextBlockStart < chunkEnd) { if (freeChunk==32) { // if no free chunk, relinquish entire block and write back cache->chunkNextArray[chunkToProcess] = chunkNext; spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD); // if writeback failed, we *might* have a free block, retry if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS) goto retry; // otherwise give up and try the next cache line goto trynextcacheline; } cache->chunkStartArray[freeChunk] = nextBlockStart; cache->chunkNextArray[freeChunk] = chunkNext; cache->chunkTriangleArray[freeChunk] = chunkTriangle; cache->chunkNextArray[chunkToProcess] = freeChunk | CHUNKNEXT_BUSY_BIT; tailChunk = freeChunk; #ifdef INFO printf("[%d] Insert tail, tailChunk=%d, chunkNext=%d, chunkToProcess=%d\n", _SPUID, tailChunk, chunkNext, chunkToProcess); debug_render_tasks(cache); #endif } else { // we're gonna use freeChunk2 for the "in front" block, as we've not // used freeChunk, let's use it as it's more likely to have a free chunk freeChunk2 = freeChunk; tailChunk = chunkNext; } // mark region before as available if required and possible thisChunk = chunkToProcess; if (thisBlockStart > chunkStart) { if (freeChunk2 != 32) { // mark this region as busy cache->chunkStartArray[freeChunk2]=thisBlockStart; cache->chunkNextArray[freeChunk2]=tailChunk | CHUNKNEXT_BUSY_BIT; cache->chunkTriangleArray[freeChunk2]=chunkTriangle; // mark region before as available for processing cache->chunkNextArray[chunkToProcess]=freeChunk2; cache->chunkTriangleArray[chunkToProcess]=triangle->next_triangle; thisChunk = freeChunk2; #ifdef INFO printf("[%d] Insert new head, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk); debug_render_tasks(cache); #endif } else { // need to keep whole block, update info and mark bust 
cache->chunkTriangleArray[chunkToProcess]=chunkTriangle; cache->chunkNextArray[chunkToProcess]=tailChunk | CHUNKNEXT_BUSY_BIT; realBlockStart = chunkStart; printf("[%d] Keep whole block, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk); debug_render_tasks(cache); #ifdef INFO #endif sleep(); } } // merge chunks merge_cache_blocks(cache); // write the cache line back spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD); } while (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS); // finally after the write succeeded, update the variables chunkNext = tailChunk; chunkToProcess = thisChunk; chunkStart = firstTile; //thisBlockStart; chunkLength = nextBlockStart - firstTile; chunkEnd = chunkStart + chunkLength - 1; freeChunk = 32; // now we can process the block up to endTriangle initTileBuffers(thisBlockStart, chunkEnd); int ok=0; while (chunkTriangle != endTriangle) { #ifdef INFO printf("[%d] Processing chunk %d at %4d len %4d, triangle %04x first=%d tbs=%d\n", _SPUID, chunkToProcess, chunkStart, chunkLength, chunkTriangle, firstTile, thisBlockStart); #endif // and actually process that triangle on these chunks processTriangleChunks(triangle, cache, thisBlockStart, chunkEnd, chunkTriangle, ok); ok=1; #ifdef PAUSE sleep(); #endif // and advance to the next-triangle chunkTriangle = triangle->next_triangle; // this should only ever happen if we're running really low on cache line slots // basically, if we pick up a block with more than NUMBER_OF_TILES_PER_CHUNK and // there's no slot to store the pre-NUMBER_OF_TILES_PER_CHUNK tiles. // in this case, we process from thisBlockStart only (because we know that from // chunkStart to there has no result) and then we only process one triangle if (chunkStart != realBlockStart) { /* printf("[%d] chunkStart=%d != realBlockStart %d, chunkEnd=%d, " "firstTile=%d chunk=%d\n", _SPUID, chunkStart, realBlockStart, chunkEnd, firstTile, chunkToProcess); debug_render_tasks(cache); */ // abort the while loop break; } // read the next triangle unsigned int extra = chunkTriangle & 127; unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127); triangle = (Triangle*) (trianglebuffer+extra); unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127; // ensure DMA slot available do {} while (!spu_readchcnt(MFC_Cmd)); spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); } // until chunkTriangle == endTriangle // flush any output buffers flushTileBuffers(thisBlockStart, chunkEnd); } // firstTile>=0
Triangle* getTriangleBuffer(Context* context) { // if we've already allocated a triangle buffer (and we're in the same context) if (context == _currentTriangleContext && _currentTriangle) return _currentTriangle; // trash the default values _currentTriangleContext = context; _currentTriangle = NULL; // read the current renderable cache line to ensure there is room for the triangle data // in the cache line buffer; we do this by comparing against all 16 cache line blocks // to make sure that extending the write pointer wouldn't clobber the data unsigned long long cache_ea = context->renderableCacheLine; if (cache_ea == 0) return NULL; char cachebuffer[128+127]; RenderableCacheLine* cache = (RenderableCacheLine*) ( ((unsigned int)cachebuffer+127) & ~127 ); // printf("GTB: reading to %x from %x:%x\n", cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea)); spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD); spu_readch(MFC_RdAtomicStat); // extendvalid = ( read<=write && test<end ) || ( read>write && test<read ) // extendvalid = ( read>write && read>test ) || ( read<=write && end>test ) // simplifies to extendvalid = selb(end, read, read>write) > test // or extendvalid = selb(end>test, read>test, read>write) // rewind = next >= end // rewindvalid = read != 0 // valid = extendvalid && (!rewind || rewindvalid) // = extendvalid && (!rewind || !rewindinvalid) // = extendvalid && !(rewind && rewindinvalid) // invalid = ! (extendvalid && !(rewind && rewindinvalid)) // = (!extendvalid || (rewind && rewindinvalid)) vec_ushort8 v_writeptr = spu_splats( cache->endTriangle ); vec_ushort8 v_readptr0 = cache->chunkTriangle[0]; vec_ushort8 v_readptr1 = cache->chunkTriangle[1]; vec_ushort8 v_testptr = spu_add(v_writeptr, TRIANGLE_MAX_SIZE); vec_ushort8 v_nextptr = spu_add(v_writeptr, 2*TRIANGLE_MAX_SIZE); vec_ushort8 v_endptr = spu_splats( (unsigned short)TRIANGLE_BUFFER_SIZE); vec_ushort8 v_zero = spu_splats( (unsigned short) 0 ); vec_uchar16 v_merger = (vec_uchar16) { 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 }; vec_ushort8 v_max0_test = spu_sel( v_endptr, v_readptr0, spu_cmpgt( v_readptr0, v_writeptr ) ); vec_ushort8 v_max1_test = spu_sel( v_endptr, v_readptr1, spu_cmpgt( v_readptr1, v_writeptr ) ); vec_ushort8 v_extend0_valid = spu_cmpgt( v_max0_test, v_testptr ); vec_ushort8 v_extend1_valid = spu_cmpgt( v_max1_test, v_testptr ); vec_ushort8 v_rewind0_invalid = spu_cmpeq( v_readptr0, v_zero ); vec_ushort8 v_rewind1_invalid = spu_cmpeq( v_readptr1, v_zero ); vec_ushort8 v_rewind8 = spu_cmpgt( v_nextptr, v_endptr ); vec_uchar16 v_extend_valid = (vec_uchar16) spu_shuffle( v_extend0_valid, v_extend1_valid, v_merger ); vec_uchar16 v_rewind_invalid = (vec_uchar16) spu_shuffle( v_rewind0_invalid, v_rewind1_invalid, v_merger ); vec_uchar16 v_rewind = (vec_uchar16) v_rewind8; vec_uchar16 v_valid_rhs = spu_and( v_rewind_invalid, v_rewind ); vec_uchar16 v_invalid = spu_orc( v_valid_rhs, v_extend_valid ); // check to see if the chunk is being processed vec_uint4 v_free = spu_gather( spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) ); vec_uint4 v_invalid_bits = spu_andc( spu_gather( v_invalid ), (vec_uint4) v_free ); // if any of the bits are invalid, then no can do if ( spu_extract(v_invalid_bits, 0) ) { return NULL; } // fetch in the data before this triangle in the cache buffer unsigned int offset = cache->endTriangle; _currentTriangleBufferExtra = offset & 127; unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (offset & 
~127); if (_currentTriangleBufferExtra) { spu_mfcdma64(_currentTriangleBuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), 128, 0, MFC_GET_CMD); // ensure DMA did actually complete mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); } // final bit of initialisation _currentTriangle = (Triangle*) (_currentTriangleBuffer+_currentTriangleBufferExtra); _currentTriangleOffset = offset; _currentTriangleRewind = v_rewind8; _currentTriangleCacheEndTriangleEAL = mfc_ea2l(cache_ea) + (((char*)&cache->endTriangle) - ((char*)cache)); _currentTriangleCacheEndTriangleEAH = mfc_ea2h(cache_ea); _currentTriangleBufferEA = trianglebuffer_ea; // printf("Allocated new triangle buffer: %x\n", offset); // and return the buffer ready to go return _currentTriangle; }
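/*
 * A hedged note on the stack-buffer alignment trick used in both getTriangleBuffer()
 * and process_render_tasks(): over-allocate the local buffer by 127 bytes and round
 * the address up to the next 128-byte boundary, since the MFC atomic commands
 * (GETLLAR/PUTLLC) operate on aligned 128-byte lines.  The helper name is not from
 * the original source.
 */
static void* align_up_128(void* p)
{
  return (void*)(((unsigned int)p + 127) & ~127);
}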
int main (unsigned long long spe_id, unsigned long long argp, unsigned long long envp) { unsigned int id; int i, j, bufindex; vector float temp[4]; /* this is a set of 2 16K buffers */ vector float buf[2][BLOCK][BLOCK/4] __attribute__ ((aligned(128))); vector float out[2][BLOCK][BLOCK/4] __attribute__ ((aligned(128))); vector unsigned char maskLeft = (vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17}; vector unsigned char maskRight = (vector unsigned char){0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f}; transpose_package_t package; /* location markers */ unsigned long long dataaddr = 0; int rowid, blockid, blockaddr, blockstart, row; int opporowid, oppoblockaddr; /* read in package */ mfc_get(&package, argp, sizeof(transpose_package_t), TAG, 0, 0); mfc_write_tag_mask(1<<TAG); mfc_read_tag_status_all(); id = package.id; blockstart = id * (N / THREADCNT / BLOCK) * BLOCK * sizeof(float); /* For each Row set (64 rows in a row set) * for each block * for each row in a block * read */ for (rowid = 0; rowid < N; rowid += BLOCK) { /* read in prebuf */ blockid = 0; blockaddr = blockstart + (blockid * sizeof(buf[0][0])); /* each rowset is 64 rows */ for (row = rowid; row < rowid + BLOCK; row++) { dataaddr = package.srcbuf + (row * N * sizeof(float)) + blockaddr; mfc_get( buf[blockid & 1][row % BLOCK], dataaddr, sizeof(buf[0][0]), 0, 0, 0); } /* each spu must walk 8 blocks per rowset */ for (blockid = 1; blockid < (N / THREADCNT / BLOCK); blockid++) { blockaddr = blockstart + (blockid * sizeof(buf[0][0])); /* each rowset is 64 rows */ for (row = rowid; row < rowid + BLOCK; row++) { dataaddr = package.srcbuf + (row * N * sizeof(float)) + blockaddr; mfc_get( buf[blockid & 1][row % BLOCK], dataaddr, sizeof(buf[0][0]), blockid & 1, 0, 0); } mfc_write_tag_mask(1 << (1 - (blockid & 1))); mfc_read_tag_status_all(); bufindex = (blockid & 1) ? 0 : 1; /* transpose the previous block */ for (i = 0; i < BLOCK; i+= 4) { for (j = 0; j < BLOCK / 4; j++) { /* first phase */ temp[0] = spu_shuffle( buf[bufindex][i][j], buf[bufindex][i+2][j], maskLeft); temp[1] = spu_shuffle( buf[bufindex][i][j], buf[bufindex][i+2][j], maskRight); temp[2] = spu_shuffle( buf[bufindex][i+1][j], buf[bufindex][i+3][j], maskLeft); temp[3] = spu_shuffle( buf[bufindex][i+1][j], buf[bufindex][i+3][j], maskRight); /* second phase */ out[bufindex][j*4][i/4] = spu_shuffle(temp[0], temp[2], maskLeft); out[bufindex][(j*4)+1][i/4] = spu_shuffle(temp[0], temp[2], maskRight); out[bufindex][(j*4)+2][i/4] = spu_shuffle(temp[1], temp[3], maskLeft); out[bufindex][(j*4)+3][i/4] = spu_shuffle(temp[1], temp[3], maskRight); } } /* calculating opposite location! */ oppoblockaddr = rowid * sizeof(float); blockaddr = blockstart + ((blockid - 1) * sizeof(buf[0][0])); opporowid = blockaddr / sizeof(float); /* write the block back out -> to the opposite location! */ for (row = opporowid; row < opporowid + BLOCK; row++) { dataaddr = package.destbuf + (row * N * sizeof(float)) + oppoblockaddr; mfc_put( out[1 - (blockid & 1)][row % BLOCK], dataaddr, sizeof(buf[0][0]), 1 - (blockid & 1), 0, 0); } } /* handle final block in row */ mfc_write_tag_mask(2); mfc_read_tag_status_all(); /* process remaining block */ bufindex = (blockid == 1) ? 
0 : 1; /* transpose the previous block */ /* i indexes the row */ for (i = 0; i < BLOCK; i+=4) { /* j indexes the column */ for (j = 0; j < BLOCK / 4; j++) { /* first phase */ temp[0] = spu_shuffle( buf[bufindex][i][j], buf[bufindex][i+2][j], maskLeft); temp[1] = spu_shuffle( buf[bufindex][i][j], buf[bufindex][i+2][j], maskRight); temp[2] = spu_shuffle( buf[bufindex][i+1][j], buf[bufindex][i+3][j], maskLeft); temp[3] = spu_shuffle( buf[bufindex][i+1][j], buf[bufindex][i+3][j], maskRight); /* second phase */ out[bufindex][j*4][i/4] = spu_shuffle(temp[0], temp[2], maskLeft); out[bufindex][(j*4)+1][i/4] = spu_shuffle(temp[0], temp[2], maskRight); out[bufindex][(j*4)+2][i/4] = spu_shuffle(temp[1], temp[3], maskLeft); out[bufindex][(j*4)+3][i/4] = spu_shuffle(temp[1], temp[3], maskRight); } } /* calculating opposite for the previous block */ blockaddr = blockstart + ((blockid - 1) * sizeof(buf[0][0])); oppoblockaddr = rowid * sizeof(float); opporowid = blockaddr / sizeof(float); /* write the block back out -> to the opposite location! */ for (row = opporowid; row < opporowid + BLOCK; row++) { dataaddr = package.destbuf + (row * N * sizeof(float)) + oppoblockaddr; mfc_put( out[bufindex][row % BLOCK], dataaddr, sizeof(buf[0][0]), 1, 0, 0); } mfc_read_tag_status_all(); } return 0; }
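/*
 * A hedged standalone illustration of the two-phase shuffle used in the loops above:
 * it transposes one 4x4 block of floats held in four vector registers, using the same
 * maskLeft/maskRight patterns.  The function and array names are local to this sketch.
 */
#include <spu_intrinsics.h>

static void transpose4x4(const vector float row[4], vector float col[4])
{
  const vector unsigned char maskLeft  = (vector unsigned char){
      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
      0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17 };
  const vector unsigned char maskRight = (vector unsigned char){
      0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
      0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f };
  vector float t0, t1, t2, t3;

  /* first phase: interleave words of row pairs (0,2) and (1,3) */
  t0 = spu_shuffle(row[0], row[2], maskLeft);
  t1 = spu_shuffle(row[0], row[2], maskRight);
  t2 = spu_shuffle(row[1], row[3], maskLeft);
  t3 = spu_shuffle(row[1], row[3], maskRight);

  /* second phase: interleave the intermediates to produce the columns */
  col[0] = spu_shuffle(t0, t2, maskLeft);
  col[1] = spu_shuffle(t0, t2, maskRight);
  col[2] = spu_shuffle(t1, t3, maskLeft);
  col[3] = spu_shuffle(t1, t3, maskRight);
}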
/* * NAME: sha256->search() * DESCRIPTION: try to find a nonce which satisfies a target hash */ int64_t sha256_search(const message_t M, const hash_t target, const hash_t midstate, uint32_t start_nonce, uint32_t range) { uint32_t nonce, stop_nonce = start_nonce + range + (4 - (range % 4)) % 4; # if !defined(UNROLL_SHA256) int t; # endif vec_uint4 W0[3], a0, b0, c0, d0, e0, f0, g0, h0; vec_uint4 W[16], a, b, c, d, e, f, g, h, T1, T2; vec_uint4 borrow, solution; const vec_uchar16 reverse_endian = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; /* precompute first three rounds */ a = SPLAT(midstate.words[0]); b = SPLAT(midstate.words[1]); c = SPLAT(midstate.words[2]); d = SPLAT(midstate.words[3]); e = SPLAT(midstate.words[4]); f = SPLAT(midstate.words[5]); g = SPLAT(midstate.words[6]); h = SPLAT(midstate.words[7]); # ifdef UNROLL_SHA256 W[0] = SPLAT(M.words[0]); ROUND(0); W[1] = SPLAT(M.words[1]); ROUND(1); W[2] = SPLAT(M.words[2]); ROUND(2); # else for (t = 0; t < 3; ++t) { W[t] = SPLAT(M.words[t]); ROUND(t); } # endif W0[0] = W[0]; W0[1] = W[1]; W0[2] = W[2]; a0 = a; b0 = b; c0 = c; d0 = d; e0 = e; f0 = f; g0 = g; h0 = h; /* do the search, four at a time */ for (nonce = start_nonce; nonce != stop_nonce; nonce += 4) { W[0] = W0[0]; W[1] = W0[1]; W[2] = W0[2]; a = a0; b = b0; c = c0; d = d0; e = e0; f = f0; g = g0; h = h0; /* t = 3 */ W[3] = (vec_uint4) { nonce + 0, nonce + 1, nonce + 2, nonce + 3 }; ROUND(3); # ifdef UNROLL_SHA256 W[ 4] = SPLAT(M.words[ 4]); ROUND( 4); W[ 5] = SPLAT(M.words[ 5]); ROUND( 5); W[ 6] = SPLAT(M.words[ 6]); ROUND( 6); W[ 7] = SPLAT(M.words[ 7]); ROUND( 7); W[ 8] = SPLAT(M.words[ 8]); ROUND( 8); W[ 9] = SPLAT(M.words[ 9]); ROUND( 9); W[10] = SPLAT(M.words[10]); ROUND(10); W[11] = SPLAT(M.words[11]); ROUND(11); W[12] = SPLAT(M.words[12]); ROUND(12); W[13] = SPLAT(M.words[13]); ROUND(13); W[14] = SPLAT(M.words[14]); ROUND(14); W[15] = SPLAT(M.words[15]); ROUND(15); # else for (t = 4; t < 16; ++t) { W[t] = SPLAT(M.words[t]); ROUND(t); } # endif # ifdef UNROLL_SHA256 W[16 % 16] = W(16); ROUND(16); W[17 % 16] = W(17); ROUND(17); W[18 % 16] = W(18); ROUND(18); W[19 % 16] = W(19); ROUND(19); W[20 % 16] = W(20); ROUND(20); W[21 % 16] = W(21); ROUND(21); W[22 % 16] = W(22); ROUND(22); W[23 % 16] = W(23); ROUND(23); W[24 % 16] = W(24); ROUND(24); W[25 % 16] = W(25); ROUND(25); W[26 % 16] = W(26); ROUND(26); W[27 % 16] = W(27); ROUND(27); W[28 % 16] = W(28); ROUND(28); W[29 % 16] = W(29); ROUND(29); W[30 % 16] = W(30); ROUND(30); W[31 % 16] = W(31); ROUND(31); W[32 % 16] = W(32); ROUND(32); W[33 % 16] = W(33); ROUND(33); W[34 % 16] = W(34); ROUND(34); W[35 % 16] = W(35); ROUND(35); W[36 % 16] = W(36); ROUND(36); W[37 % 16] = W(37); ROUND(37); W[38 % 16] = W(38); ROUND(38); W[39 % 16] = W(39); ROUND(39); W[40 % 16] = W(40); ROUND(40); W[41 % 16] = W(41); ROUND(41); W[42 % 16] = W(42); ROUND(42); W[43 % 16] = W(43); ROUND(43); W[44 % 16] = W(44); ROUND(44); W[45 % 16] = W(45); ROUND(45); W[46 % 16] = W(46); ROUND(46); W[47 % 16] = W(47); ROUND(47); W[48 % 16] = W(48); ROUND(48); W[49 % 16] = W(49); ROUND(49); W[50 % 16] = W(50); ROUND(50); W[51 % 16] = W(51); ROUND(51); W[52 % 16] = W(52); ROUND(52); W[53 % 16] = W(53); ROUND(53); W[54 % 16] = W(54); ROUND(54); W[55 % 16] = W(55); ROUND(55); W[56 % 16] = W(56); ROUND(56); W[57 % 16] = W(57); ROUND(57); W[58 % 16] = W(58); ROUND(58); W[59 % 16] = W(59); ROUND(59); W[60 % 16] = W(60); ROUND(60); W[61 % 16] = W(61); ROUND(61); W[62 % 16] = W(62); ROUND(62); W[63 % 16] = W(63); ROUND(63); # else for (t = 16; t < 64; ++t) 
{ W[t % 16] = W(t); ROUND(t); } # endif W[0] = ADD(a, midstate.words[0]); W[1] = ADD(b, midstate.words[1]); W[2] = ADD(c, midstate.words[2]); W[3] = ADD(d, midstate.words[3]); W[4] = ADD(e, midstate.words[4]); W[5] = ADD(f, midstate.words[5]); W[6] = ADD(g, midstate.words[6]); W[7] = ADD(h, midstate.words[7]); /* first SHA-256 complete */ a = SPLAT(H0.words[0]); b = SPLAT(H0.words[1]); c = SPLAT(H0.words[2]); d = SPLAT(H0.words[3]); e = SPLAT(H0.words[4]); f = SPLAT(H0.words[5]); g = SPLAT(H0.words[6]); h = SPLAT(H0.words[7]); ROUND(0); ROUND(1); ROUND(2); ROUND(3); ROUND(4); ROUND(5); ROUND(6); ROUND(7); W[ 8] = SPLAT(0x80000000U); ROUND( 8); # ifdef UNROLL_SHA256 W[ 9] = SPLAT(0x00000000U); ROUND( 9); W[10] = SPLAT(0x00000000U); ROUND(10); W[11] = SPLAT(0x00000000U); ROUND(11); W[12] = SPLAT(0x00000000U); ROUND(12); W[13] = SPLAT(0x00000000U); ROUND(13); W[14] = SPLAT(0x00000000U); ROUND(14); # else for (t = 9; t < 15; ++t) { W[t] = SPLAT(0U); ROUND(t); } # endif W[15] = SPLAT(0x00000100U); ROUND(15); # ifdef UNROLL_SHA256 W[16 % 16] = W(16); ROUND(16); W[17 % 16] = W(17); ROUND(17); W[18 % 16] = W(18); ROUND(18); W[19 % 16] = W(19); ROUND(19); W[20 % 16] = W(20); ROUND(20); W[21 % 16] = W(21); ROUND(21); W[22 % 16] = W(22); ROUND(22); W[23 % 16] = W(23); ROUND(23); W[24 % 16] = W(24); ROUND(24); W[25 % 16] = W(25); ROUND(25); W[26 % 16] = W(26); ROUND(26); W[27 % 16] = W(27); ROUND(27); W[28 % 16] = W(28); ROUND(28); W[29 % 16] = W(29); ROUND(29); W[30 % 16] = W(30); ROUND(30); W[31 % 16] = W(31); ROUND(31); W[32 % 16] = W(32); ROUND(32); W[33 % 16] = W(33); ROUND(33); W[34 % 16] = W(34); ROUND(34); W[35 % 16] = W(35); ROUND(35); W[36 % 16] = W(36); ROUND(36); W[37 % 16] = W(37); ROUND(37); W[38 % 16] = W(38); ROUND(38); W[39 % 16] = W(39); ROUND(39); W[40 % 16] = W(40); ROUND(40); W[41 % 16] = W(41); ROUND(41); W[42 % 16] = W(42); ROUND(42); W[43 % 16] = W(43); ROUND(43); W[44 % 16] = W(44); ROUND(44); W[45 % 16] = W(45); ROUND(45); W[46 % 16] = W(46); ROUND(46); W[47 % 16] = W(47); ROUND(47); W[48 % 16] = W(48); ROUND(48); W[49 % 16] = W(49); ROUND(49); W[50 % 16] = W(50); ROUND(50); W[51 % 16] = W(51); ROUND(51); W[52 % 16] = W(52); ROUND(52); W[53 % 16] = W(53); ROUND(53); W[54 % 16] = W(54); ROUND(54); W[55 % 16] = W(55); ROUND(55); W[56 % 16] = W(56); ROUND(56); W[57 % 16] = W(57); ROUND(57); W[58 % 16] = W(58); ROUND(58); W[59 % 16] = W(59); ROUND(59); /* t = 60..63 delayed */ # else for (t = 16; t < 60; ++t) { W[t % 16] = W(t); ROUND(t); } # endif W[60 % 16] = W(60); T1 = T1(60, e, f, g, h); T2 = ADD(ADD(d, T1), H0.words[7]); /* quick check to see if any element of the last word vector is zero */ if (__builtin_expect(spu_extract(spu_gather(spu_cmpeq(T2, 0)), 0) == 0, 1)) continue; /* we have something interesting; finish the SHA-256 */ ROUND(60); # ifdef UNROLL_SHA256 W[61 % 16] = W(61); ROUND(61); W[62 % 16] = W(62); ROUND(62); W[63 % 16] = W(63); ROUND(63); # else for (t = 61; t < 64; ++t) { W[t % 16] = W(t); ROUND(t); } # endif a = ADD(a, H0.words[0]); b = ADD(b, H0.words[1]); c = ADD(c, H0.words[2]); d = ADD(d, H0.words[3]); e = ADD(e, H0.words[4]); f = ADD(f, H0.words[5]); g = ADD(g, H0.words[6]); h = ADD(h, H0.words[7]); /* now do the full (reversed-endian) subtraction */ borrow = spu_genb(SPLAT(target.words[7]), spu_shuffle(a, a, reverse_endian)); borrow = spu_genbx(SPLAT(target.words[6]), spu_shuffle(b, b, reverse_endian), borrow); borrow = spu_genbx(SPLAT(target.words[5]), spu_shuffle(c, c, reverse_endian), borrow); borrow = spu_genbx(SPLAT(target.words[4]), 
spu_shuffle(d, d, reverse_endian), borrow); borrow = spu_genbx(SPLAT(target.words[3]), spu_shuffle(e, e, reverse_endian), borrow); borrow = spu_genbx(SPLAT(target.words[2]), spu_shuffle(f, f, reverse_endian), borrow); borrow = spu_genbx(SPLAT(target.words[1]), spu_shuffle(g, g, reverse_endian), borrow); borrow = spu_genbx(SPLAT(target.words[0]), spu_shuffle(h, h, reverse_endian), borrow); solution = spu_gather(borrow); if (__builtin_expect(spu_extract(solution, 0) == 0, 1)) continue; /* we have a winner */ return nonce + (spu_extract(spu_cntlz(solution), 0) - 28); } return -1; }
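/*
 * A hedged illustration of the lane-selection arithmetic at the end of the search
 * loop above: spu_gather() packs the low bit of each word into the low nibble of
 * element 0, so spu_cntlz() of that value is 28 plus the index of the leftmost set
 * lane.  The helper name is not from the original source.
 */
#include <assert.h>
#include <spu_intrinsics.h>

static unsigned int first_set_lane(vec_uint4 flags /* low bit set or clear per word */)
{
  vec_uint4 packed = spu_gather(flags);            /* 4 flag bits end up in element 0 */
  assert(spu_extract(packed, 0) != 0);             /* caller has already checked this */
  return spu_extract(spu_cntlz(packed), 0) - 28;   /* 0 = leftmost lane */
}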
void process_buffer(int buffer, int cnt, vector float dt_v) { int i; volatile vector float *p_inv_mass_v; vector float force_v, inv_mass_v; vector float pos0, pos1, pos2, pos3; vector float vel0, vel1, vel2, vel3; vector float dt_inv_mass_v, dt_inv_mass_v_0, dt_inv_mass_v_1, dt_inv_mass_v_2, dt_inv_mass_v_3; vector unsigned char splat_word_0 = (vector unsigned char){0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; vector unsigned char splat_word_1 = (vector unsigned char){4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7}; vector unsigned char splat_word_2 = (vector unsigned char){8, 9,10,11, 8, 9,10,11, 8, 9,10,11, 8, 9,10,11}; vector unsigned char splat_word_3 = (vector unsigned char){12,13,14,15,12,13,14,15,12,13,14,15,12,13,14,15}; p_inv_mass_v = (volatile vector float *)&inv_mass[buffer][0]; force_v = ctx.force_v; // Compute the step in time for the block of particles, four // particle at a time. for (i=0; i<cnt; i+=4) { inv_mass_v = *p_inv_mass_v++; pos0 = pos[buffer][i+0]; pos1 = pos[buffer][i+1]; pos2 = pos[buffer][i+2]; pos3 = pos[buffer][i+3]; vel0 = vel[buffer][i+0]; vel1 = vel[buffer][i+1]; vel2 = vel[buffer][i+2]; vel3 = vel[buffer][i+3]; dt_inv_mass_v = spu_mul(dt_v, inv_mass_v); pos0 = spu_madd(vel0, dt_v, pos0); pos1 = spu_madd(vel1, dt_v, pos1); pos2 = spu_madd(vel2, dt_v, pos2); pos3 = spu_madd(vel3, dt_v, pos3); dt_inv_mass_v_0 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_0); dt_inv_mass_v_1 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_1); dt_inv_mass_v_2 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_2); dt_inv_mass_v_3 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_3); vel0 = spu_madd(dt_inv_mass_v_0, force_v, vel0); vel1 = spu_madd(dt_inv_mass_v_1, force_v, vel1); vel2 = spu_madd(dt_inv_mass_v_2, force_v, vel2); vel3 = spu_madd(dt_inv_mass_v_3, force_v, vel3); pos[buffer][i+0] = pos0; pos[buffer][i+1] = pos1; pos[buffer][i+2] = pos2; pos[buffer][i+3] = pos3; vel[buffer][i+0] = vel0; vel[buffer][i+1] = vel1; vel[buffer][i+2] = vel2; vel[buffer][i+3] = vel3; } } int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long argv) { int buffer, next_buffer; int cnt, next_cnt, left; float time, dt; vector float dt_v; volatile vector float *ctx_pos_v, *ctx_vel_v; volatile vector float *next_ctx_pos_v, *next_ctx_vel_v; volatile float *ctx_inv_mass, *next_ctx_inv_mass; unsigned int tags[2]; // Reserve a pair of DMA tag IDs tags[0] = mfc_tag_reserve(); tags[1] = mfc_tag_reserve(); // Input parameter argv is a pointer to the particle context. // Fetch the parameter context, waiting for it to complete. spu_writech(MFC_WrTagMask, 1 << tags[0]); spu_mfcdma32((void *)(&ctx), (unsigned int)argv, sizeof(parm_context), tags[0], MFC_GET_CMD); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); dt = ctx.dt; dt_v = spu_splats(dt); // For each step in time for (time=0; time<END_OF_TIME; time += dt) { // For each double buffered block of particles left = ctx.particles; cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK; ctx_pos_v = ctx.pos_v; ctx_vel_v = ctx.vel_v; ctx_inv_mass = ctx.inv_mass; // Prefetch first buffer of input data. 
buffer = 0; spu_mfcdma32((void *)(pos), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[0], MFC_GETB_CMD); spu_mfcdma32((void *)(vel), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[0], MFC_GET_CMD); spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx_inv_mass), cnt * sizeof(float), tags[0], MFC_GET_CMD); while (cnt < left) { left -= cnt; next_ctx_pos_v = ctx_pos_v + cnt; next_ctx_vel_v = ctx_vel_v + cnt; next_ctx_inv_mass = ctx_inv_mass + cnt; next_cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK; // Prefetch next buffer so the data is available for computation on next loop iteration. // The first DMA is barriered so that we don't GET data before the previous iteration's // data is PUT. next_buffer = buffer^1; spu_mfcdma32((void *)(&pos[next_buffer][0]), (unsigned int)(next_ctx_pos_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GETB_CMD); spu_mfcdma32((void *)(&vel[next_buffer][0]), (unsigned int)(next_ctx_vel_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GET_CMD); spu_mfcdma32((void *)(&inv_mass[next_buffer][0]), (unsigned int)(next_ctx_inv_mass), next_cnt * sizeof(float), tags[next_buffer], MFC_GET_CMD); // Wait for previously prefetched data spu_writech(MFC_WrTagMask, 1 << tags[buffer]); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); process_buffer(buffer, cnt, dt_v); // Put the buffer's position and velocity data back into system memory spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD); spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD); ctx_pos_v = next_ctx_pos_v; ctx_vel_v = next_ctx_vel_v; ctx_inv_mass = next_ctx_inv_mass; buffer = next_buffer; cnt = next_cnt; } // Wait for previously prefetched data spu_writech(MFC_WrTagMask, 1 << tags[buffer]); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); process_buffer(buffer, cnt, dt_v); // Put the buffer's position and velocity data back into system memory spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD); spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD); // Wait for DMAs to complete before starting the next step in time. spu_writech(MFC_WrTagMask, 1 << tags[buffer]); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); } return (0); }
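/*
 * A hedged convenience wrapper for the wait pattern repeated throughout main()
 * above (select a tag group with MFC_WrTagMask, then block in spu_mfcstat()); the
 * helper name is not from the original source.
 */
#include <spu_intrinsics.h>
#include <spu_mfcio.h>

static void wait_for_tag(unsigned int tag)
{
  /* select the tag group, then block until all of its outstanding DMAs complete */
  spu_writech(MFC_WrTagMask, 1 << tag);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
}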
/* Scans the string pointed to by s for the character c and
 * returns a pointer to the last occurrence of c. If
 * c is not found, then NULL is returned.
 */
char * strrchr(const char *s, int c)
{
  int nskip;
  vec_uchar16 *ptr, data, vc;
  vec_uint4 cmp_c, cmp_0, cmp;
  vec_uint4 res_ptr, res_cmp;
  vec_uint4 mask, result;
  vec_uint4 one = spu_splats(0xffffU);

  /* Scan memory array a quadword at a time. Skip leading
   * mis-aligned bytes.
   */
  ptr = (vec_uchar16 *)s;
  nskip = -((unsigned int)(ptr) & 15);
  mask = spu_rlmask(one, nskip);

  vc = spu_splats((unsigned char)(c));

  data = *ptr++;
  ptr = (vec_uchar16 *)((unsigned int)ptr & ~15);
  cmp_c = spu_and(spu_gather(spu_cmpeq(data, vc)), mask);
  cmp_0 = spu_and(spu_gather(spu_cmpeq(data, 0)), mask);

  res_ptr = spu_splats(0U);
  res_cmp = spu_splats(0U);

  while (spu_extract(cmp_0, 0) == 0) {
    cmp = spu_cmpeq(cmp_c, 0);

    res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp);
    res_cmp = spu_sel(cmp_c, res_cmp, cmp);

    data = *ptr++;

    cmp_c = spu_gather(spu_cmpeq(data, vc));
    cmp_0 = spu_gather(spu_cmpeq(data, 0));

    cmp = spu_cmpeq(cmp_c, 0);
  }

  /* Compute the location of the last character before the termination
   * character.
   *
   * First mask off compare results following the first termination character.
   */
  mask = spu_sl(one, 31 - spu_extract(spu_cntlz(cmp_0), 0));
  cmp_c = spu_and(cmp_c, mask);

  /* Conditionally update res_ptr and res_cmp if a match was found in the last
   * quadword.
   */
  cmp = spu_cmpeq(cmp_c, 0);

  res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp);
  res_cmp = spu_sel(cmp_c, res_cmp, cmp);

  /* Bit reverse res_cmp for locating the last occurrence. */
  mask = spu_cmpeq(res_cmp, 0);

  res_cmp = (vec_uint4)spu_maskb(spu_extract(res_cmp, 0));
  res_cmp = spu_gather((vec_uchar16)spu_shuffle(res_cmp, res_cmp,
                        VEC_LITERAL(vec_uchar16, 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)));

  /* Compute the location (ptr) of the last occurrence of c. If no
   * occurrence was found (i.e., element 0 of res_cmp == 0), then return
   * NULL.
   */
  result = spu_sub(spu_add(res_ptr, 15), spu_cntlz(res_cmp));
  result = spu_andc(result, mask);

  return ((char *)spu_extract(result, 0));
}
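/*
 * A minimal hedged usage example for the vectorized strrchr() above; the string
 * literal and helper name are illustrative only.
 */
#include <stdio.h>
#include <string.h>

static void strrchr_demo(void)
{
  const char *path = "/usr/local/bin/spu-gcc";
  char *slash = strrchr(path, '/');

  /* prints "spu-gcc": the last '/' is located even though the scan runs a quadword at a time */
  if (slash)
    printf("%s\n", slash + 1);
}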
void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){ //loop iterator i int i = 0; void* retval = target; //put the target and source addresses into qwords vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0}; vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0}; //create shuffle masks //shuffle mask building blocks: //all from the first vector vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; //all from the second vector vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16 vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16)); vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp); vector unsigned char cmp_res = spu_or(gt_res, eq_res); vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16); //eta: second half of the second, first half of the first, break at (unsigned int)src1%16 src_cmp = spu_splats((unsigned char)((unsigned int)src1%16)); gt_res = spu_cmpgt(oneup, src_cmp); eq_res = spu_cmpeq(oneup, src_cmp); cmp_res = spu_or(gt_res, eq_res); sixteen_uchar = spu_splats((unsigned char)16); phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16); vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16)); vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16)); //alpha: first half of first, second half of second, break at (unsigned int)target%16 src_cmp = spu_splats((unsigned char)((unsigned int)target%16)); gt_res = spu_cmpgt(oneup, src_cmp); eq_res = spu_cmpeq(oneup, src_cmp); cmp_res = spu_or(gt_res, eq_res); phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); //delta: first half of first, first half of second, break at (unsigned int)target%16 vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); //epsilon: second half of second, second half of first, break at (unsigned int)target%16 vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16 vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); //beta: first half of first, second half of second, break at num_bytes%16 src_cmp = spu_splats((unsigned char)(num_bytes%16)); gt_res = spu_cmpgt(oneup, src_cmp); eq_res = spu_cmpeq(oneup, src_cmp); cmp_res = spu_or(gt_res, eq_res); phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_beta = spu_add((vector unsigned 
int)phase_change, (vector unsigned int)oneup); qword src0_past; qword src0_present; qword src1_past; qword src1_present; qword tgt_past; qword tgt_present; qword in_temp0; qword in_temp1; qword out_temp0; qword out_temp1; src0_past = si_lqd((qword)address_counter_src0, 0); src1_past = si_lqd((qword)address_counter_src1, 0); tgt_past = si_lqd((qword)address_counter_tgt, 0); vector unsigned char shuffle_mask_complexprod0 = {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b}; vector unsigned char shuffle_mask_complexprod1 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b}; vector unsigned char shuffle_mask_complexprod2 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f}; vector unsigned char sign_changer = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00}; vector float prod0; qword shuf0; vector float prod1; vector float sign_change; qword summand0; qword summand1; vector float sum; for(i = 0; i < num_bytes/16; ++i) { src0_present = si_lqd((qword)address_counter_src0, 16); src1_present = si_lqd((qword)address_counter_src1, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma); in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta); prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1); shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0); prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0); sign_change = spu_xor(prod0, (vector float)sign_changer); summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1); summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2); sum = spu_add((vector float)summand0, (vector float)summand1); out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta); out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon); si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); tgt_past = out_temp1; src0_past = src0_present; src1_past = src1_present; address_counter_src0 = spu_add(address_counter_src0, 16); address_counter_src1 = spu_add(address_counter_src1, 16); address_counter_tgt = spu_add(address_counter_tgt, 16); } /* handle the final, possibly partial, output quadword outside the loop */ src0_present = si_lqd((qword)address_counter_src0, 16); src1_present = si_lqd((qword)address_counter_src1, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma); in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta); prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1); shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0); prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0); sign_change = spu_xor(prod0, (vector float)sign_changer); summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1); summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2); sum = spu_add((vector float)summand0, (vector float)summand1); qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta); out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); out_temp1 = 
spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); return retval; }
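/* A stripped-down sketch (assumed, not part of the original library) of the complex-multiply core used by the unaligned routine above, for data that is already 16-byte aligned. Each quadword holds two interleaved complex floats {re0, im0, re1, im1}; the function name and signature are illustrative. */
#include <spu_intrinsics.h>

static void pointwise_multiply_32fc_aligned(vector float *out, const vector float *a, const vector float *b, unsigned int nquads)
{
    /* Swap real and imaginary words within each complex number. */
    const vector unsigned char swap_ri = {0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03, 0x0c,0x0d,0x0e,0x0f, 0x08,0x09,0x0a,0x0b};
    /* Flips the sign of the ai*bi terms (elements 1 and 3 of prod0). */
    const vector unsigned char sign_changer = {0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00};
    /* pick_even selects elements 0 and 2 of each operand, pick_odd selects
     * elements 1 and 3; adding the two shuffles yields {re0, im0, re1, im1}. */
    const vector unsigned char pick_even = {0x00,0x01,0x02,0x03, 0x10,0x11,0x12,0x13, 0x08,0x09,0x0a,0x0b, 0x18,0x19,0x1a,0x1b};
    const vector unsigned char pick_odd  = {0x04,0x05,0x06,0x07, 0x14,0x15,0x16,0x17, 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f};
    unsigned int i;
    for (i = 0; i < nquads; ++i) {
        vector float av = a[i];
        vector float bv = b[i];
        vector float prod0 = spu_mul(av, bv);                               /* {ar*br, ai*bi, ...}  */
        vector float prod1 = spu_mul(av, spu_shuffle(bv, bv, swap_ri));     /* {ar*bi, ai*br, ...}  */
        vector float signed0 = spu_xor(prod0, (vector float)sign_changer);  /* {ar*br, -ai*bi, ...} */
        /* real = ar*br - ai*bi, imag = ar*bi + ai*br */
        out[i] = spu_add(spu_shuffle(signed0, prod1, pick_even), spu_shuffle(signed0, prod1, pick_odd));
    }
}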
void merge_buffers(){ vector unsigned int cmp_v, cmp_v2; const vector signed int one_at_0 = {1,0,0,0}; const vector signed int one_at_1 = {0,1,0,0}; const vector signed int one_at_2 = {0,0,1,0}; const vector signed int ones = {1,1,1,1}; const vector signed int zeros = {0,0,0,0}; const vector unsigned char cmp_v_shuffle_mask = {31,31,31,31, 31,31,31,31, 31,31,31,31, 31,31,31,31}; vector unsigned char rev_mask; const vector unsigned char rev_left = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}; const vector unsigned char rev_right = {28,29,30,31, 24,25,26,27, 20,21,22,23, 16,17,18,19}; vector signed int *out_head_idx; if(mcb[am].local[OUT] < 255){ int parent_idx = mcb[am].local[OUT]; int side = (mcb[am].id+1)&1; out_head_idx = (vector signed int*) &md[parent_idx].idx[side][HEAD]; } else { out_head_idx = (vector signed int*) &md[am].idx[OUT][HEAD]; } vector signed int *left_tail_idx = (vector signed int*) &md[am].idx[LEFT][TAIL]; vector signed int *right_tail_idx = (vector signed int*) &md[am].idx[RIGHT][TAIL]; vector signed int size_v = {mcb[am].buffer_size[LEFT], mcb[am].buffer_size[RIGHT], mcb[am].buffer_size[OUT], 0}; vector signed int avail_v = {num_in_buffer(LEFT), num_in_buffer(RIGHT), num_free_in_buffer(OUT), 1}; vector signed int avail_before = { spu_extract(avail_v, 0), spu_extract(avail_v, 1), 0, 0 }; vector unsigned int avail = spu_gather( spu_cmpgt(avail_v, zeros) ); // avail = 0x0F if all avail_v > zeros vector signed int *left, *right, *out; left = (vector signed int*) &md[am].buffer[LEFT][ spu_extract(*left_tail_idx,0) ]; right = (vector signed int*) &md[am].buffer[RIGHT][ spu_extract(*right_tail_idx,0) ]; out = (vector signed int*) &md[am].buffer[OUT][ spu_extract(*out_head_idx,0) ]; #ifdef TRACE_TIME dec_val2 = spu_read_decrementer(); #endif while(spu_extract(avail,0) == 0x0F){ // cmp left and right to determine who gets eaten cmp_v = spu_cmpgt(*left,*right); cmp_v = spu_shuffle(cmp_v, cmp_v, cmp_v_shuffle_mask); // cmp_v = {FFFF,FFFF,FFFF,FFFF} if left[3] > right[3] *out = spu_sel(*left,*right,cmp_v); rev_mask = spu_sel(rev_right,rev_left,(vector unsigned char)cmp_v); *left = spu_shuffle(*left,*right,rev_mask); // data to be sorted is now in out and left, left in descending order sort_vectors(out,left); // update index of the used side if( spu_extract(cmp_v,0) ){ // left[3] > right[3] *right_tail_idx = spu_add(*right_tail_idx,ones); avail_v = spu_sub(avail_v, one_at_1); right++; // modulus hack cmp_v2 = spu_cmpeq(*right_tail_idx, size_v); if( __builtin_expect( spu_extract(cmp_v2,0) ,0) ){ *right_tail_idx = zeros; right = (vector signed int*) &md[am].buffer[RIGHT][0]; } } else { *right = *left; *left_tail_idx = spu_add(*left_tail_idx,ones); avail_v = spu_sub(avail_v, one_at_0); left++; // modulus hack cmp_v2 = spu_cmpeq(*left_tail_idx, size_v); if( __builtin_expect( spu_extract(cmp_v2,0) ,0) ){ *left_tail_idx = zeros; left = (vector signed int*) &md[am].buffer[LEFT][0]; } } // update out head idx *out_head_idx = spu_add(*out_head_idx,ones); avail_v = spu_sub(avail_v, one_at_2); out++; // modulus hack cmp_v2 = spu_cmpeq(*out_head_idx, size_v); if( __builtin_expect(spu_extract(cmp_v2,0),0) ){ out = (vector signed int*) &md[am].buffer[OUT][0]; *out_head_idx = zeros; } // is there data still available? avail = spu_gather(spu_cmpgt(avail_v, zeros)); } #ifdef TRACE_TIME merge_loop_ticks += -(spu_read_decrementer() - dec_val2); #endif // how much got produced? 
vector signed int consumed = spu_sub(avail_before, avail_v); int consumed_left = spu_extract(consumed, 0); int consumed_right = spu_extract(consumed, 1); if(consumed_left) update_tail(LEFT); if(consumed_right) update_tail(RIGHT); md[am].consumed[LEFT] += consumed_left; md[am].consumed[RIGHT] += consumed_right; if(md[am].consumed[LEFT] == mcb[am].data_size[LEFT]) md[am].depleted[LEFT] = 1; if(md[am].consumed[RIGHT] == mcb[am].data_size[RIGHT]) md[am].depleted[RIGHT] = 1; if(mcb[am].local[OUT] < 255 && md[am].depleted[LEFT] && md[am].depleted[RIGHT]){ md[am].done = 1; --num_active_mergers; } }
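/* Hypothetical sketch of the ring-buffer occupancy helpers that merge_buffers() calls but which are not shown here (num_in_buffer / num_free_in_buffer). It assumes idx[side][HEAD] is the next slot the producer writes and idx[side][TAIL] is the next slot the consumer reads, both already wrapped to buffer_size[side] by the same "modulus hack" used in the merge loop, and it ignores the parent-merger redirection of the OUT head index handled above; the real definitions may differ. */
static inline int num_in_buffer(int side)
{
    int head = md[am].idx[side][HEAD];
    int tail = md[am].idx[side][TAIL];
    int size = mcb[am].buffer_size[side];
    int n = head - tail;
    return (n >= 0) ? n : n + size;   /* wrap-around case */
}

static inline int num_free_in_buffer(int side)
{
    /* Keep one slot unused so a full ring is distinguishable from an empty one. */
    return mcb[am].buffer_size[side] - num_in_buffer(side) - 1;
}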