/*
 * Count one bounding box into the per-axis min/max bin histograms.
 *
 * The min corner and the max corner of `aabb` are each turned into a vector
 * of bin indices by GetBinSIMD(), measured against the enclosing box `baabb`.
 * For each of the three axes, the counter selected by the min-corner index is
 * incremented in mmb->minbins and the counter selected by the max-corner
 * index is incremented in mmb->maxbins.  Only lanes 0..2 of the index
 * vectors are used.
 *
 * aabb   box being binned
 * mmb    histogram pair that receives the counts
 * baabb  enclosing box that defines the binning range
 *
 * NOTE(review): the casts assume the min/max members are 16-byte aligned and
 * readable as four floats -- confirm against the aabb_t definition.
 * NOTE(review): spu_re() is a reciprocal estimate; presumably GetBinSIMD()
 * clamps its result to [0, nsamplepoints - 1] -- confirm before relying on
 * the indices being in range.
 */
inline void MinMaxBinCount3SIMD(aabb_t *aabb, minmaxbin_t *mmb, aabb_t *baabb)
{
    /* Corners of the enclosing box and of the box being binned. */
    const vector float outer_lo = *(vector float *)baabb->min;
    const vector float outer_hi = *(vector float *)baabb->max;
    const vector float box_lo = *(vector float *)aabb->min;
    const vector float box_hi = *(vector float *)aabb->max;

    /* Bin count, its reciprocal estimate, and the index of the last bin. */
    const vector float bin_count = spu_splats((float)nsamplepoints);
    const vector float inv_bin_count = spu_re(bin_count);
    const vector float last_bin = spu_sub(bin_count, spu_splats(1.0f));

    /* 1 / (width of a single bin), per axis. */
    const vector float extent = spu_abs(spu_sub(outer_hi, outer_lo));
    const vector float inv_bin_width = spu_re(spu_mul(extent, inv_bin_count));

    const vector int lo_bin = GetBinSIMD(outer_lo, outer_hi, box_lo, inv_bin_width, last_bin);
    const vector int hi_bin = GetBinSIMD(outer_lo, outer_hi, box_hi, inv_bin_width, last_bin);

    int axis;
    for (axis = 0; axis < 3; axis++) {
        mmb->minbins[lo_bin[axis]].b[axis]++;
        mmb->maxbins[hi_bin[axis]].b[axis]++;
    }
}
/** * Setup fragment shader inputs by evaluating triangle's vertex * attribute coefficient info. * \param x quad x pos * \param y quad y pos * \param fragZ returns quad Z values * \param fragInputs returns fragment program inputs * Note: this code could be incorporated into the fragment program * itself to avoid the loop and switch. */ static void eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[]) { static const vector float deltaX = (const vector float) {0, 1, 0, 1}; static const vector float deltaY = (const vector float) {0, 0, 1, 1}; const uint posSlot = 0; const vector float pos = setup.coef[posSlot].a0; const vector float dposdx = setup.coef[posSlot].dadx; const vector float dposdy = setup.coef[posSlot].dady; const vector float fragX = spu_splats(x) + deltaX; const vector float fragY = spu_splats(y) + deltaY; vector float fragW, wInv; uint i; *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy); fragW = splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy); wInv = spu_re(fragW); /* 1 / w */ /* loop over fragment program inputs */ for (i = 0; i < spu.vertex_info.num_attribs; i++) { uint attr = i + 1; enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode; /* constant term */ vector float a0 = setup.coef[attr].a0; vector float r0 = splatx(a0); vector float r1 = splaty(a0); vector float r2 = splatz(a0); vector float r3 = splatw(a0); if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) { /* linear term */ vector float dadx = setup.coef[attr].dadx; vector float dady = setup.coef[attr].dady; /* Use SPU intrinsics here to get slightly better code. 
* originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady); */ r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0)); r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1)); r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2)); r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3)); if (interp == INTERP_PERSPECTIVE) { /* perspective term */ r0 *= wInv; r1 *= wInv; r2 *= wInv; r3 *= wInv; } } fragInputs[CHAN0] = r0; fragInputs[CHAN1] = r1; fragInputs[CHAN2] = r2; fragInputs[CHAN3] = r3; fragInputs += 4; } } /** * Emit a quad (pass to next stage). No clipping is done. * Note: about 1/5 to 1/7 of the time, mask is zero and this function * should be skipped. But adding the test for that slows things down * overall. */ static INLINE void emit_quad( int x, int y, mask_t mask) { /* If any bits in mask are set... */ if (spu_extract(spu_orx(mask), 0)) { const int ix = x - setup.cliprect_minx; const int iy = y - setup.cliprect_miny; spu.cur_ctile_status = TILE_STATUS_DIRTY; spu.cur_ztile_status = TILE_STATUS_DIRTY; { /* * Run fragment shader, execute per-fragment ops, update fb/tile. */ vector float inputs[4*4], outputs[2*4]; vector unsigned int kill_mask; vector float fragZ; eval_inputs((float) x, (float) y, &fragZ, inputs); ASSERT(spu.fragment_program); ASSERT(spu.fragment_ops); /* Execute the current fragment program */ kill_mask = spu.fragment_program(inputs, outputs, spu.constants); mask = spu_andc(mask, kill_mask); /* Execute per-fragment/quad operations, including: * alpha test, z test, stencil test, blend and framebuffer writing. * Note that there are two different fragment operations functions * that can be called, one for front-facing fragments, and one * for back-facing fragments. (Often the two are the same; * but in some cases, like two-sided stenciling, they can be * very different.) So choose the correct function depending * on the calculated facing. 
*/ spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile, fragZ, outputs[0*4+0], outputs[0*4+1], outputs[0*4+2], outputs[0*4+3], mask); } } } /** * Given an X or Y coordinate, return the block/quad coordinate that it * belongs to. */ static INLINE int block(int x) { return x & ~1; } /** * Render a horizontal span of quads */ static void flush_spans(void) { int minleft, maxright; const int l0 = spu_extract(setup.span.quad, 0); const int l1 = spu_extract(setup.span.quad, 1); const int r0 = spu_extract(setup.span.quad, 2); const int r1 = spu_extract(setup.span.quad, 3); switch (setup.span.y_flags) { case 0x3: /* both odd and even lines written (both quad rows) */ minleft = MIN2(l0, l1); maxright = MAX2(r0, r1); break; case 0x1: /* only even line written (quad top row) */ minleft = l0; maxright = r0; break; case 0x2: /* only odd line written (quad bottom row) */ minleft = l1; maxright = r1; break; default: return; } /* OK, we're very likely to need the tile data now. * clear or finish waiting if needed. 
*/ if (spu.cur_ctile_status == TILE_STATUS_GETTING) { /* wait for mfc_get() to complete */ //printf("SPU: %u: waiting for ctile\n", spu.init.id); wait_on_mask(1 << TAG_READ_TILE_COLOR); spu.cur_ctile_status = TILE_STATUS_CLEAN; } else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) { //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty); clear_c_tile(&spu.ctile); spu.cur_ctile_status = TILE_STATUS_DIRTY; } ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED); if (spu.read_depth_stencil) { if (spu.cur_ztile_status == TILE_STATUS_GETTING) { /* wait for mfc_get() to complete */ //printf("SPU: %u: waiting for ztile\n", spu.init.id); wait_on_mask(1 << TAG_READ_TILE_Z); spu.cur_ztile_status = TILE_STATUS_CLEAN; } else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) { //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty); clear_z_tile(&spu.ztile); spu.cur_ztile_status = TILE_STATUS_DIRTY; } ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED); } /* XXX this loop could be moved into the above switch cases... */ /* Setup for mask calculation */ const vec_int4 quad_LlRr = setup.span.quad; const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8); const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B)); const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B)); const vec_int4 twos = spu_splats(2); const int x = block(minleft); vec_int4 xs = {x, x+1, x, x+1}; for (; spu_extract(xs, 0) <= block(maxright); xs += twos) { /** * Computes mask to indicate which pixels in the 2x2 quad are actually * inside the triangle's bounds. 
*/ /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */ const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs); const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs); /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */ const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs); /* Combine results to create mask */ const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs); emit_quad(spu_extract(xs, 0), setup.span.y, mask); } setup.span.y = 0; setup.span.y_flags = 0; /* Zero right elements */ setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0)); } #if DEBUG_VERTS static void print_vertex(const struct vertex_header *v) { uint i; fprintf(stderr, " Vertex: (%p)\n", v); for (i = 0; i < spu.vertex_info.num_attribs; i++) { fprintf(stderr, " %d: %f %f %f %f\n", i, spu_extract(v->data[i], 0), spu_extract(v->data[i], 1), spu_extract(v->data[i], 2), spu_extract(v->data[i], 3)); } }
void draw_frame(uint64_t buf_ea) { vec_uint4 buf[2*1920/4]; int row, col, i, tag = 0; float step = 4.0f/spu.width*spu.zoom; float xbeg = spu.xc - spu.width*step*0.5f; vec_float4 vxbeg = spu_splats(xbeg) + spu_splats(step) * (vec_float4) { 0.f,1.f,2.f,3.f }; vec_float4 xstep = spu_splats(step)*spu_splats(4.f); vec_float4 vyp = spu_splats(spu.yc - spu.height*step*0.5f + step*spu.rank); const vec_float4 vinc = spu_splats(spu.count * step); const vec_float4 esc2 = spu_splats(BAILOUT*BAILOUT); #if BAILBITS != 1 const vec_float4 esc21 = spu_splats(4.f/(BAILOUT*BAILOUT)); #endif const vec_float4 two = spu_splats(2.f); const vec_float4 zero = spu_splats(0.f); const vec_float4 colsc = spu_splats(255.f); const vec_float4 ccr = spu_splats(4.f*BAILOUT/(3.5f*3.141592654f)); const vec_float4 ccg = spu_splats(4.f*BAILOUT/(5.f*3.141592654f)); const vec_float4 ccb = spu_splats(4.f*BAILOUT/(9.f*3.141592654f)); vec_float4 x, y, x2, y2, m2, vxp; vec_uint4 cmp, inc; vec_uint4 vi; vec_uint4 *p, *b; vec_float4 co; /* Process the full image. As there are 6 SPUs working in parallel, each with * a different rank from 0 to 5, each SPU processes only the line numbers: * rank, rank+6, rank+12, ... * The program uses a SPU DMA programming technique known as "double buffering", * where the previously generated line is transmitted to main memory while we * compute the next one, hence the need for a local buffer containing two lines. 
*/ for (row = spu.rank; row < spu.height; row += spu.count) { /* Pixel buffer address (in local memory) of the next line to be drawn */ b = p = buf + ((1920/4)&-tag); vxp = vxbeg; /* first four x coordinates */ /* Process a whole screen line by packets of 4 pixels */ for (col = spu.width/4; col > 0 ; col--) { vi = spu_splats(0u); x = vxp; y = vyp; i = 0; cmp = spu_splats(-1u); inc = spu_splats(1u); m2 = zero; /* This loop processes the Mandelbrot suite for the four complex numbers * whose real part are the components of the x vector, and the imaginary * part are in y (as we process the same line, all initial values of y * are equal). * We perform loop unrolling for SPU performance optimization reasons, * hence the 4x replication of the same computation block. */ do { x2 = x*x; y2 = y*y; m2 = spu_sel(m2, x2+y2, cmp); cmp = spu_cmpgt(esc2, m2); inc = spu_and(inc, cmp); /* increment the iteration count only if */ vi = vi + inc; /* we're still inside the bailout radius */ y = two*x*y + vyp; x = x2-y2 + vxp; x2 = x*x; y2 = y*y; m2 = spu_sel(m2, x2+y2, cmp); cmp = spu_cmpgt(esc2, m2); inc = spu_and(inc, cmp); vi = vi + inc; y = two*x*y + vyp; x = x2-y2 + vxp; x2 = x*x; y2 = y*y; m2 = spu_sel(m2, x2+y2, cmp); cmp = spu_cmpgt(esc2, m2); inc = spu_and(inc, cmp); vi = vi + inc; y = two*x*y + vyp; x = x2-y2 + vxp; x2 = x*x; y2 = y*y; m2 = spu_sel(m2, x2+y2, cmp); cmp = spu_cmpgt(esc2, m2); inc = spu_and(inc, cmp); vi = vi + inc; y = two*x*y + vyp; x = x2-y2 + vxp; i += 4; } /* Exit the loop only if the iteration limit of 128 has been reached, * or all current four points are outside the bailout radius. * The __builtin_expect(xxx, 1) construct hints the compiler that the xxx * test has greater chance of being true (1), so a branch hinting * instruction is inserted into the binary code to make the conditional * branch faster in most cases (except the last one when we exit the * loop). This results in performance increase. 
*/ while (__builtin_expect((i < 128) & (si_to_int((qword)spu_gather(cmp)) != 0), 1)); /* smooth coloring: compute the fractional part */ co = spu_convtf(vi, 0) + spu_splats(1.f); co -= fast_logf(fast_logf(m2) * spu_splats(.5f)); #if BAILBITS != 1 co = spu_re(spu_rsqrte(co*esc21)); #endif /* Compute the red, green an blue pixel components */ vec_uint4 cr = spu_convtu(mcos(co * ccr) * colsc, 0); vec_uint4 cg = spu_convtu(mcos(co * ccg) * colsc, 0); vec_uint4 cb = spu_convtu(mcos(co * ccb) * colsc, 0); /* Put the 4 pixel values in the buffer */ *p++ = (spu_sl(cr, 16) | spu_sl(cg, 8) | cb) & ~-inc; vxp += xstep; } /* double-buffered dma: initiate a dma transfer of last computed scanline * then wait for completion of the second last transfer (previous computed * line). This is done by changing the tag value. */ mfc_put(b, buf_ea+(spu.width*4)*row, spu.width*4, tag, 0, 0); tag = 1 - tag; wait_for_completion(tag); vyp += vinc; } /* wait for completion of last sent image line */ wait_for_completion(1-tag); }
/* Divide two vectors of doubles element-wise: returns a_in / b_in.
 *
 * The operands are split into exponent and significand.  The significands
 * are divided using a single precision reciprocal estimate refined by
 * Newton-Raphson steps, and the exponent difference is re-applied at the end
 * through one multiplication.  Denormal inputs are normalized first; zeros,
 * infinities and NaNs are handled by patching the final multiplier.
 */
vector double
__divv2df3 (vector double a_in, vector double b_in)
{
  /* Bit patterns.  hi_exp_bits is the exponent field as seen in the high
     32-bit word of a double; exp_bits/sign_bit cover the full 64 bits.  */
  const vec_uint4   hi_exp_bits = spu_splats((unsigned int)0x7FF00000);
  const vec_ullong2 exp_bits = spu_splats(0x7FF0000000000000ULL);
  const vec_ullong2 sign_bit = spu_splats(0x8000000000000000ULL);
  const vec_ullong2 sign_exp_bits = spu_or(sign_bit, exp_bits);

  /* Shuffle patterns: copy the high word of each doubleword over its low
     word, and swap the two words of each doubleword.  */
  const vec_uchar16 dup_hi_word = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8, 9,10,11, 8,9,10,11 };
  const vec_uchar16 swap_words = (vec_uchar16) { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };

  const vec_float4  onef = spu_splats(1.0f);
  const vec_double2 one = spu_splats(1.0);
  /* Double with biased exponent 0x035 (0x34 above the exponent 1 that a
     denormal's fraction is implicitly scaled by) and a zero fraction.  */
  const vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);

  /* Extract the floating point components from each of the operands:
     the exponent field, replicated into both words of each doubleword...  */
  vec_uint4 a_hiexp = (vec_uint4)spu_and((vec_uint4)a_in, hi_exp_bits);
  vec_uint4 b_hiexp = (vec_uint4)spu_and((vec_uint4)b_in, hi_exp_bits);
  a_hiexp = spu_shuffle(a_hiexp, a_hiexp, dup_hi_word);
  b_hiexp = spu_shuffle(b_hiexp, b_hiexp, dup_hi_word);

  /* ...and "fraction is all zero": the word-wise compare is and-ed with its
     word-swapped copy so that both words of a doubleword must be zero.  */
  vec_ullong2 a_frac_is_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_bits), 0);
  a_frac_is_0 = spu_and(a_frac_is_0, spu_shuffle(a_frac_is_0, a_frac_is_0, swap_words));

  vec_ullong2 b_frac_is_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_bits), 0);
  b_frac_is_0 = spu_and(b_frac_is_0, spu_shuffle(b_frac_is_0, b_frac_is_0, swap_words));

  const vec_ullong2 a_exp_is_max = (vec_ullong2)spu_cmpeq(a_hiexp, hi_exp_bits);
  const vec_ullong2 b_exp_is_max = (vec_ullong2)spu_cmpeq(b_hiexp, hi_exp_bits);

  /* Identify all possible special values that must be accommodated:
     +-denorm, +-0, +-infinity, and NaNs.  */
  const vec_ullong2 a_exp_is_0 = (vec_ullong2)spu_cmpeq(a_hiexp, 0);
  const vec_ullong2 a_is_nan = spu_andc(a_exp_is_max, a_frac_is_0);
  const vec_ullong2 a_is_zero = spu_and (a_exp_is_0, a_frac_is_0);
  const vec_ullong2 a_is_inf = spu_and (a_exp_is_max, a_frac_is_0);
  const vec_ullong2 a_is_denorm = spu_andc(a_exp_is_0, a_is_zero);

  const vec_ullong2 b_exp_is_0 = (vec_ullong2)spu_cmpeq(b_hiexp, 0);
  const vec_ullong2 b_is_nan = spu_andc(b_exp_is_max, b_frac_is_0);
  const vec_ullong2 b_is_zero = spu_and (b_exp_is_0, b_frac_is_0);
  const vec_ullong2 b_is_inf = spu_and (b_exp_is_max, b_frac_is_0);
  const vec_ullong2 b_is_denorm = spu_andc(b_exp_is_0, b_is_zero);

  /* Turn denormal inputs into normalized numbers: or in the exp_53 exponent
     and subtract exp_53 carrying the operand's sign.  Operands that are not
     denormal are kept unchanged.  */
  vec_double2 a_norm = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_bit));
  a_norm = spu_sel(a_in, a_norm, a_is_denorm);

  vec_double2 b_norm = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_bit));
  b_norm = spu_sel(b_in, b_norm, b_is_denorm);

  /* Split off the exponent fields, and replace them by the exponent of 1.0
     so that each significand keeps its sign and has a magnitude in
     [1.0, 2.0).  */
  const vec_ullong2 a_expfield = spu_and((vec_ullong2)a_norm, exp_bits);
  const vec_ullong2 b_expfield = spu_and((vec_ullong2)b_norm, exp_bits);

  const vec_double2 a_sig = spu_sel(a_norm, one, (vec_ullong2)exp_bits);
  const vec_double2 b_sig = spu_sel(b_norm, one, (vec_ullong2)exp_bits);

  /* Approximate the reciprocal of the divisor significand with the single
     precision reciprocal estimate followed by one single precision
     Newton-Raphson iteration.  */
  const vec_float4 b_sig_f = spu_roundtf(b_sig);
  vec_float4 recip_f = spu_re(b_sig_f);
  recip_f = spu_madd(spu_nmsub(b_sig_f, recip_f, onef), recip_f, recip_f);

  /* Refine in double precision: one more Newton-Raphson step on the
     reciprocal, then a quotient estimate corrected by its residual.  The
     original authors state the result is in the range (0.5, 2.0).  */
  vec_double2 recip = spu_extend(recip_f);
  recip = spu_madd(spu_nmsub(b_sig, recip, one), recip, recip);
  const vec_double2 quot0 = spu_mul(a_sig, recip);
  vec_double2 quot1 = spu_madd(spu_nmsub(b_sig, quot0, a_sig), recip, quot0);

  /* Exponent correction to apply to quot1: the difference of the normalized
     operands' exponents, adjusted by the 0x34 scaling that was applied to
     denormal inputs (subtracted for the dividend, added for the divisor).  */
  vec_int4 q_exp = spu_rlmaska(spu_sub((vec_int4)a_expfield, (vec_int4)b_expfield), -20);
  q_exp = spu_add(q_exp, (vec_int4)spu_add(spu_and((vec_int4)a_is_denorm, -0x34), spu_and((vec_int4)b_is_denorm, 0x34)));

  /* Move part of the correction into quot1 itself, so that a single
     multiplier can reach the entire double precision domain (including
     denorms).  bias is 64 when the correction is non-negative and -65 when
     it is negative (sign mask xor 64); it is added to quot1's exponent field
     and removed from the correction.  */
  const vec_int4 bias = spu_xor(spu_rlmaska(q_exp, -31), 64);
  q_exp = spu_sub(q_exp, bias);

  quot1 = spu_sel(quot1, (vec_double2)spu_add((vec_int4)quot1, spu_sl(bias, 20)), exp_bits);

  /* Build the multiplier 2^q_exp: add the exponent bias 0x3FF and shift into
     the exponent field.  The multiplier is zero when the biased exponent is
     not positive.  On overflow, clamp it to the maximum non-infinite number
     in case the rounding mode is not round-to-nearest.  */
  q_exp = spu_add(q_exp, 0x3FF);
  const vec_uint4 has_no_underflow = spu_cmpgt(q_exp, 0);
  const vec_uint4 has_overflow = spu_cmpgt(q_exp, 0x7FE);
  q_exp = spu_and(spu_sl(q_exp, 20), (vec_int4)has_no_underflow);
  q_exp = spu_and(q_exp, (vec_int4)exp_bits);

  vec_double2 scale = spu_sel((vec_double2)q_exp, (vec_double2)(spu_add((vec_uint4)exp_bits, -1)), (vec_ullong2)has_overflow);

  /* Handle special value conditions.  These include:

     1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN
        a NaN results.
     2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN an
        INFINITY results.
     3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0
        results.  */
  scale = spu_andc(scale, (vec_double2)spu_or(a_is_zero, b_is_inf));
  scale = spu_sel(scale, (vec_double2)exp_bits, spu_or(a_is_inf, b_is_zero));

  vec_ullong2 any_nan = spu_or(a_is_nan, b_is_nan);
  any_nan = spu_or(any_nan, spu_and(a_is_zero, b_is_zero));
  any_nan = spu_or(any_nan, spu_and(a_is_inf, b_is_inf));

  scale = spu_or(scale, (vec_double2)any_nan);

  /* Scale the final quotient */
  return spu_mul(quot1, scale);
}