static inline vec_float4 vec_dot3(vec_float4 vec0, vec_float4 vec1)
{
  vec_float4 result;

  result = spu_mul(vec0, vec1);
  result = spu_madd(spu_rlqwbyte(vec0, 4), spu_rlqwbyte(vec1, 4), result);
  return spu_madd(spu_rlqwbyte(vec0, 8), spu_rlqwbyte(vec1, 8), result);
}
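The quadword byte rotations align the y and z components under the x slot, so element 0 of the result holds the 3-component dot product. A hypothetical scalar reference (not part of the original source), for what the preferred slot computes:

#include <stdio.h>

/* Scalar reference for vec_dot3: element 0 of the SIMD result equals
 * x0*x1 + y0*y1 + z0*z1 (the w components never contribute to slot 0). */
static float dot3_ref(const float v0[4], const float v1[4])
{
    return v0[0] * v1[0] + v0[1] * v1[1] + v0[2] * v1[2];
}

int main(void)
{
    const float a[4] = {1.0f, 2.0f, 3.0f, 99.0f};
    const float b[4] = {4.0f, 5.0f, 6.0f, 99.0f};
    printf("%f\n", dot3_ref(a, b)); /* prints 32.0 */
    return 0;
}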
int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long parm)
{
  int i, j;
  int left, cnt;
  float time;
  unsigned int tag_id;
  vector float dt_v, dt_inv_mass_v;

  // Reserve a tag ID
  tag_id = mfc_tag_reserve();
  spu_writech(MFC_WrTagMask, -1);

  // Input parameter parm is a pointer to the particle parameter context.
  // Fetch the context, waiting for it to complete.
  spu_mfcdma32((void *)(&ctx), (unsigned int)parm, sizeof(parm_context),
               tag_id, MFC_GET_CMD);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  dt_v = spu_splats(ctx.dt);

  // For each step in time
  for (time = 0; time < END_OF_TIME; time += ctx.dt) {
    // For each block of particles
    for (i = 0; i < ctx.particles; i += PARTICLES_PER_BLOCK) {
      // Determine the number of particles in this block.
      left = ctx.particles - i;
      cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

      // Fetch the data - position, velocity and inverse_mass. Wait for the DMA
      // to complete before performing computation.
      spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v + i),
                   cnt * sizeof(vector float), tag_id, MFC_GETB_CMD);
      spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v + i),
                   cnt * sizeof(vector float), tag_id, MFC_GET_CMD);
      spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx.inv_mass + i),
                   cnt * sizeof(float), tag_id, MFC_GET_CMD);
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

      // Compute the step in time for the block of particles
      for (j = 0; j < cnt; j++) {
        pos[j] = spu_madd(vel[j], dt_v, pos[j]);
        dt_inv_mass_v = spu_mul(dt_v, spu_splats(inv_mass[j]));
        vel[j] = spu_madd(dt_inv_mass_v, ctx.force_v, vel[j]);
      }

      // Put the position and velocity data back into system memory
      spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v + i),
                   cnt * sizeof(vector float), tag_id, MFC_PUT_CMD);
      spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v + i),
                   cnt * sizeof(vector float), tag_id, MFC_PUT_CMD);
    }
  }

  // Wait for final DMAs to complete before terminating SPU thread.
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  return (0);
}
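The snippet references a parm_context structure and local-store buffers (ctx, pos, vel, inv_mass) defined elsewhere. A plausible layout, inferred purely from how the fields are accessed above and assumed here for illustration:

/* Assumed context and buffer declarations; the real definitions live in a
 * shared header that this snippet does not show. Field names come from the
 * accesses above, the ordering and alignment are guesses. */
typedef struct {
    vector float force_v;   /* constant force applied to every particle */
    vector float *pos_v;    /* effective address of position vectors */
    vector float *vel_v;    /* effective address of velocity vectors */
    float *inv_mass;        /* effective address of inverse masses */
    float dt;               /* simulation time step */
    int particles;          /* total particle count */
} parm_context;

static parm_context ctx;
/* One local-store vector per particle; DMA targets should be at least
 * 16-byte aligned (128 preferred for performance). */
static volatile vector float pos[PARTICLES_PER_BLOCK] __attribute__((aligned(128)));
static volatile vector float vel[PARTICLES_PER_BLOCK] __attribute__((aligned(128)));
static volatile float inv_mass[PARTICLES_PER_BLOCK] __attribute__((aligned(128)));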
void triad()
{
    int i, j, n;
    vector float s = spu_splats(args.scalar);

    n = SIZE * sizeof(float);

    for (i = 0; (i + SIZE) < args.N; i += SIZE) {
        mfc_get((void *)&ls1[0], (unsigned int)&args.b[i], n, TAG, 0, 0);
        mfc_get((void *)&ls2[0], (unsigned int)&args.c[i], n, TAG, 0, 0);
        mfc_write_tag_mask(1 << TAG);
        mfc_read_tag_status_all();

        for (j = 0; j < (SIZE / 4); ++j)
            ls3[j] = spu_madd(s, ls2[j], ls1[j]);

        mfc_put((void *)&ls3[0], (unsigned int)&args.a[i], n, TAG, 0, 0);
    }
    mfc_write_tag_mask(1 << TAG);
    mfc_read_tag_status_all();

    if (unlikely(i < args.N)) {
        /*
         * args.N - i will be smaller than SIZE at this point, so
         * it is safe to do a DMA transfer.
         * The size is rounded down to a multiple of 128 bytes, which
         * also satisfies the DMA requirement of a multiple of 16.
         */
        n = ((args.N - i) * sizeof(float)) & (~127);

        mfc_get((void *)&ls1[0], (unsigned int)&args.b[i], n, TAG, 0, 0);
        mfc_get((void *)&ls2[0], (unsigned int)&args.c[i], n, TAG, 0, 0);
        mfc_write_tag_mask(1 << TAG);
        mfc_read_tag_status_all();

        /* n is a multiple of 16 bytes, so whole vectors are processed. */
        for (j = 0; j < ((args.N - i) / 4); ++j)
            ls3[j] = spu_madd(s, ls2[j], ls1[j]);

        mfc_put((void *)&ls3[0], (unsigned int)&args.a[i], n, TAG, 0, 0);
        mfc_write_tag_mask(1 << TAG);
        mfc_read_tag_status_all();
    }
    /*
     * At this point it may be that i is still smaller than args.N if the length
     * was not divisible by the number of SPUs times 16.
     */
}
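The operation being blocked and vectorized here is the STREAM triad; a plain scalar version, added for reference only:

/* Scalar reference for the triad kernel: a[i] = b[i] + scalar * c[i].
 * The SPU loop computes the same thing four floats at a time with
 * spu_madd(s, ls2[j], ls1[j]). */
void triad_ref(float *a, const float *b, const float *c,
               float scalar, unsigned long n)
{
    unsigned long i;
    for (i = 0; i < n; ++i)
        a[i] = scalar * c[i] + b[i];
}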
int kernel(lwp_functions* pf, void* params, void* inout,
           unsigned int iter, unsigned int n)
{
  Ternary_params* p = (Ternary_params*)params;

  switch (p->cmd)
  {
    case AM:
    {
      int length = p->length / 4;
      vector float *a = (vector float *)inout;
      vector float *b = a + length;
      vector float *c = a + 2 * length;
      unsigned int i;
      for (i = 0; i != length; ++i, ++a, ++b, ++c)
        *a = spu_mul(spu_add(*a, *b), *c);
      return 0;
    }
    case MA:
    {
      int length = p->length / 4;
      vector float *a = (vector float *)inout;
      vector float *b = a + length;
      vector float *c = a + 2 * length;
      unsigned int i;
      for (i = 0; i != length; ++i, ++a, ++b, ++c)
        *a = spu_madd(*a, *b, *c);
      return 0;
    }
    case CAM:
    {
      static vector unsigned char lo = (vector unsigned char)
        { 0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23};
      static vector unsigned char hi = (vector unsigned char)
        { 8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
      int length = p->length / 4;
      float *a = (float *)inout;
      float *b = a + 8 * length;
      float *c = a + 16 * length;
      unsigned int i;
      // (a + b) * c:
      //   r.r = (a.r+b.r)*c.r - (a.i+b.i)*c.i
      //   r.i = (a.r+b.r)*c.i + (a.i+b.i)*c.r
      for (i = 0; i != length; ++i, a += 8, b += 8, c += 8)
      {
        vector float av = {*a, *(a+2), *(a+4), *(a+6)};       // a.r
        vector float bv = {*b, *(b+2), *(b+4), *(b+6)};       // b.r
        vector float cv = {*c, *(c+2), *(c+4), *(c+6)};       // c.r
        vector float dv = {*(a+1), *(a+3), *(a+5), *(a+7)};   // a.i
        vector float ev = {*(b+1), *(b+3), *(b+5), *(b+7)};   // b.i
        vector float fv = {*(c+1), *(c+3), *(c+5), *(c+7)};   // c.i
        vector float trv = spu_add(av, bv);                   // a.r+b.r
        vector float tiv = spu_add(dv, ev);                   // a.i+b.i
        vector float sv = spu_mul(trv, cv);                   // (a.r+b.r)*c.r
        vector float tv = spu_mul(trv, fv);                   // (a.r+b.r)*c.i
        vector float real = spu_nmsub(tiv, fv, sv);           // r.r
        vector float imag = spu_madd(tiv, cv, tv);            // r.i
        // interleave result
        *(vector float *)a = spu_shuffle(real, imag, lo);
        *(vector float *)(a+4) = spu_shuffle(real, imag, hi);
      }
      return 0;
    }
    case CMA:
    {
      static vector unsigned char lo = (vector unsigned char)
        { 0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23};
      static vector unsigned char hi = (vector unsigned char)
        { 8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
      int length = p->length / 4;
      float *a = (float *)inout;
      float *b = a + 8 * length;
      float *c = a + 16 * length;
      unsigned int i;
      // a * b + c:
      //   r.r = a.r*b.r + c.r - a.i*b.i
      //   r.i = a.r*b.i + c.i + a.i*b.r
      for (i = 0; i != length; ++i, a += 8, b += 8, c += 8)
      {
        vector float av = {*a, *(a+2), *(a+4), *(a+6)};       // a.r
        vector float bv = {*b, *(b+2), *(b+4), *(b+6)};       // b.r
        vector float cv = {*c, *(c+2), *(c+4), *(c+6)};       // c.r
        vector float dv = {*(a+1), *(a+3), *(a+5), *(a+7)};   // a.i
        vector float ev = {*(b+1), *(b+3), *(b+5), *(b+7)};   // b.i
        vector float fv = {*(c+1), *(c+3), *(c+5), *(c+7)};   // c.i
        vector float real = spu_nmsub(dv, ev, spu_madd(av, bv, cv)); // r.r
        vector float imag = spu_madd(dv, bv, spu_madd(av, ev, fv));  // r.i
        // interleave result
        *(vector float *)a = spu_shuffle(real, imag, lo);
        *(vector float *)(a+4) = spu_shuffle(real, imag, hi);
      }
      return 0;
    }
    case ZAM:
    {
      int length = p->length / 4;
      float *a_re = (float *)inout;
      float *a_im = a_re + 4 * length;
      float *b_re = a_re + 8 * length;
      float *b_im = a_re + 12 * length;
      float *c_re = a_re + 16 * length;
      float *c_im = a_re + 20 * length;
      unsigned int i;
      // (a + b) * c:
      //   r.r = (a.r+b.r)*c.r - (a.i+b.i)*c.i
      //   r.i = (a.r+b.r)*c.i + (a.i+b.i)*c.r
      for (i = 0; i != length;
           ++i, a_re += 4, b_re += 4, c_re += 4, a_im += 4, b_im += 4, c_im += 4)
      {
        vector float *av = (vector float *)a_re;
        vector float *bv = (vector float *)b_re;
        vector float *cv = (vector float *)c_re;
        vector float *dv = (vector float *)a_im;
        vector float *ev = (vector float *)b_im;
        vector float *fv = (vector float *)c_im;
        vector float trv = spu_add(*av, *bv);  // a.r+b.r
        vector float tiv = spu_add(*dv, *ev);  // a.i+b.i
        vector float sv = spu_mul(trv, *cv);   // (a.r+b.r)*c.r
        vector float tv = spu_mul(trv, *fv);   // (a.r+b.r)*c.i
        *av = spu_nmsub(tiv, *fv, sv);         // r.r
        *dv = spu_madd(tiv, *cv, tv);          // r.i
      }
      return 0;
    }
    case ZMA:
    {
      int length = p->length / 4;
      float *a_re = (float *)inout;
      float *a_im = a_re + 4 * length;
      float *b_re = a_re + 8 * length;
      float *b_im = a_re + 12 * length;
      float *c_re = a_re + 16 * length;
      float *c_im = a_re + 20 * length;
      unsigned int i;
      // a * b + c:
      //   r.r = a.r*b.r + c.r - a.i*b.i
      //   r.i = a.r*b.i + c.i + a.i*b.r
      for (i = 0; i != length;
           ++i, a_re += 4, b_re += 4, c_re += 4, a_im += 4, b_im += 4, c_im += 4)
      {
        vector float *av = (vector float *)a_re;
        vector float *bv = (vector float *)b_re;
        vector float *cv = (vector float *)c_re;
        vector float *dv = (vector float *)a_im;
        vector float *ev = (vector float *)b_im;
        vector float *fv = (vector float *)c_im;
        vector float tmp = spu_nmsub(*dv, *ev, spu_madd(*av, *bv, *cv)); // r.r
        *dv = spu_madd(*dv, *bv, spu_madd(*av, *ev, *fv));               // r.i
        *av = tmp;
      }
      return 0;
    }
  }
  return 1;
}
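The CMA and ZMA cases both evaluate a*b + c over complex data (interleaved and split layouts respectively). A scalar reference using standard C complex arithmetic, added to make the spu_madd/spu_nmsub pairing explicit; it is an illustration, not part of the original kernel:

#include <complex.h>

/* Scalar reference for the CMA/ZMA cases: r = a * b + c.
 *   r.r = a.r*b.r + c.r - a.i*b.i  maps to spu_nmsub(a.i, b.i, spu_madd(a.r, b.r, c.r))
 *   r.i = a.r*b.i + c.i + a.i*b.r  maps to spu_madd(a.i, b.r, spu_madd(a.r, b.i, c.i)) */
float complex cma_ref(float complex a, float complex b, float complex c)
{
    return a * b + c;
}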
/**
 * Setup fragment shader inputs by evaluating triangle's vertex
 * attribute coefficient info.
 * \param x  quad x pos
 * \param y  quad y pos
 * \param fragZ  returns quad Z values
 * \param fragInputs  returns fragment program inputs
 * Note: this code could be incorporated into the fragment program
 * itself to avoid the loop and switch.
 */
static void
eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[])
{
   static const vector float deltaX = (const vector float) {0, 1, 0, 1};
   static const vector float deltaY = (const vector float) {0, 0, 1, 1};

   const uint posSlot = 0;
   const vector float pos = setup.coef[posSlot].a0;
   const vector float dposdx = setup.coef[posSlot].dadx;
   const vector float dposdy = setup.coef[posSlot].dady;
   const vector float fragX = spu_splats(x) + deltaX;
   const vector float fragY = spu_splats(y) + deltaY;
   vector float fragW, wInv;
   uint i;

   *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy);
   fragW =  splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy);
   wInv = spu_re(fragW);  /* 1 / w */

   /* loop over fragment program inputs */
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      uint attr = i + 1;
      enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode;

      /* constant term */
      vector float a0 = setup.coef[attr].a0;
      vector float r0 = splatx(a0);
      vector float r1 = splaty(a0);
      vector float r2 = splatz(a0);
      vector float r3 = splatw(a0);

      if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) {
         /* linear term */
         vector float dadx = setup.coef[attr].dadx;
         vector float dady = setup.coef[attr].dady;
         /* Use SPU intrinsics here to get slightly better code.
          * originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady);
          */
         r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0));
         r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1));
         r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2));
         r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3));
         if (interp == INTERP_PERSPECTIVE) {
            /* perspective term */
            r0 *= wInv;
            r1 *= wInv;
            r2 *= wInv;
            r3 *= wInv;
         }
      }
      fragInputs[CHAN0] = r0;
      fragInputs[CHAN1] = r1;
      fragInputs[CHAN2] = r2;
      fragInputs[CHAN3] = r3;
      fragInputs += 4;
   }
}


/**
 * Emit a quad (pass to next stage).  No clipping is done.
 * Note: about 1/5 to 1/7 of the time, mask is zero and this function
 * should be skipped.  But adding the test for that slows things down
 * overall.
 */
static INLINE void
emit_quad(int x, int y, mask_t mask)
{
   /* If any bits in mask are set... */
   if (spu_extract(spu_orx(mask), 0)) {
      const int ix = x - setup.cliprect_minx;
      const int iy = y - setup.cliprect_miny;

      spu.cur_ctile_status = TILE_STATUS_DIRTY;
      spu.cur_ztile_status = TILE_STATUS_DIRTY;

      {
         /*
          * Run fragment shader, execute per-fragment ops, update fb/tile.
          */
         vector float inputs[4*4], outputs[2*4];
         vector unsigned int kill_mask;
         vector float fragZ;

         eval_inputs((float) x, (float) y, &fragZ, inputs);

         ASSERT(spu.fragment_program);
         ASSERT(spu.fragment_ops);

         /* Execute the current fragment program */
         kill_mask = spu.fragment_program(inputs, outputs, spu.constants);

         mask = spu_andc(mask, kill_mask);

         /* Execute per-fragment/quad operations, including:
          * alpha test, z test, stencil test, blend and framebuffer writing.
          * Note that there are two different fragment operations functions
          * that can be called, one for front-facing fragments, and one
          * for back-facing fragments.  (Often the two are the same;
          * but in some cases, like two-sided stenciling, they can be
          * very different.)  So choose the correct function depending
          * on the calculated facing.
          */
         spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
                                        fragZ,
                                        outputs[0*4+0],
                                        outputs[0*4+1],
                                        outputs[0*4+2],
                                        outputs[0*4+3],
                                        mask);
      }
   }
}


/**
 * Given an X or Y coordinate, return the block/quad coordinate that it
 * belongs to.
 */
static INLINE int
block(int x)
{
   return x & ~1;
}


/**
 * Render a horizontal span of quads
 */
static void
flush_spans(void)
{
   int minleft, maxright;

   const int l0 = spu_extract(setup.span.quad, 0);
   const int l1 = spu_extract(setup.span.quad, 1);
   const int r0 = spu_extract(setup.span.quad, 2);
   const int r1 = spu_extract(setup.span.quad, 3);

   switch (setup.span.y_flags) {
   case 0x3:
      /* both odd and even lines written (both quad rows) */
      minleft = MIN2(l0, l1);
      maxright = MAX2(r0, r1);
      break;

   case 0x1:
      /* only even line written (quad top row) */
      minleft = l0;
      maxright = r0;
      break;

   case 0x2:
      /* only odd line written (quad bottom row) */
      minleft = l1;
      maxright = r1;
      break;

   default:
      return;
   }

   /* OK, we're very likely to need the tile data now.
    * clear or finish waiting if needed.
    */
   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
      /* wait for mfc_get() to complete */
      //printf("SPU: %u: waiting for ctile\n", spu.init.id);
      wait_on_mask(1 << TAG_READ_TILE_COLOR);
      spu.cur_ctile_status = TILE_STATUS_CLEAN;
   }
   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
      clear_c_tile(&spu.ctile);
      spu.cur_ctile_status = TILE_STATUS_DIRTY;
   }
   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);

   if (spu.read_depth_stencil) {
      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
         /* wait for mfc_get() to complete */
         //printf("SPU: %u: waiting for ztile\n", spu.init.id);
         wait_on_mask(1 << TAG_READ_TILE_Z);
         spu.cur_ztile_status = TILE_STATUS_CLEAN;
      }
      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
         clear_z_tile(&spu.ztile);
         spu.cur_ztile_status = TILE_STATUS_DIRTY;
      }
      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
   }

   /* XXX this loop could be moved into the above switch cases... */

   /* Setup for mask calculation */
   const vec_int4 quad_LlRr = setup.span.quad;
   const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
   const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
   const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));

   const vec_int4 twos = spu_splats(2);

   const int x = block(minleft);
   vec_int4 xs = {x, x+1, x, x+1};

   for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
      /**
       * Computes mask to indicate which pixels in the 2x2 quad are actually
       * inside the triangle's bounds.
       */

      /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
      const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
      const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs);

      /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
      const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);

      /* Combine results to create mask */
      const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);

      emit_quad(spu_extract(xs, 0), setup.span.y, mask);
   }

   setup.span.y = 0;
   setup.span.y_flags = 0;
   /* Zero right elements */
   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
}


#if DEBUG_VERTS
static void
print_vertex(const struct vertex_header *v)
{
   uint i;
   fprintf(stderr, "  Vertex: (%p)\n", v);
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      fprintf(stderr, "    %d: %f %f %f %f\n", i,
              spu_extract(v->data[i], 0),
              spu_extract(v->data[i], 1),
              spu_extract(v->data[i], 2),
              spu_extract(v->data[i], 3));
   }
}
#endif /* DEBUG_VERTS */
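The spu_nand of a compare result with itself is simply a vector NOT, which turns the available "greater than" compare into "greater than or equal". A hypothetical scalar sketch of the per-pixel coverage test the loop builds:

/* Scalar equivalent of the quad coverage mask: a pixel at column x on
 * quad row k is covered when l[k] <= x < r[k]. The SIMD version evaluates
 * this for {x, x+1} on both rows at once, using ~(l > x) for (x >= l). */
static int pixel_covered(int x, int l, int r)
{
    return !(l > x) && (r > x);   /* i.e. l <= x && x < r */
}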
void discretize(const uint32_t n,
                volatile vector real_t *conc_in,
                volatile vector real_t *wind,
                volatile vector real_t *diff,
                vector real_t *concbound,
                vector real_t *windbound,
                vector real_t *diffbound,
                vector real_t cell_size,
                vector real_t dt,
                volatile vector real_t *conc_out)
{
    uint32_t i, x;
    vector real_t acc;
    vector real_t c[n];
    vector real_t dcdx[n];

    /* Copy original values */
    i = 0;
    x = n;
    while (x > 8) {
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        x -= 8;
    }
    while (x > 4) {
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        c[i] = conc_out[i] = conc_in[i]; ++i;
        x -= 4;
    }
    while (x > 0) {
        c[i] = conc_out[i] = conc_in[i]; ++i;
        --x;
    }

    space_advec_diff_v(n, conc_in, wind, diff, concbound, windbound,
                       diffbound, cell_size, dcdx);

    /* First advection/diffusion step: c += dt * dcdx */
    i = 0;
    x = n;
    while (x > 8) {
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        x -= 8;
    }
    while (x > 4) {
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        x -= 4;
    }
    while (x > 0) {
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        --x;
    }

    space_advec_diff_v(n, c, wind, diff, concbound, windbound,
                       diffbound, cell_size, dcdx);

    /* Second advection/diffusion step: c += dt * dcdx */
    i = 0;
    x = n;
    while (x > 8) {
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        x -= 8;
    }
    while (x > 4) {
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        x -= 4;
    }
    while (x > 0) {
        c[i] = spu_madd(dt, dcdx[i], c[i]); ++i;
        --x;
    }

    /* Average with the original concentration and clamp negatives to zero */
#define UNROLL_ELEMENT                                   \
    acc = spu_add(conc_out[i], c[i]);                    \
    conc_out[i] = spu_mul(HALF, acc);                    \
    acc = spu_splats((real_t)0.0);                       \
    acc = (vector real_t)spu_cmpgt(conc_out[i], acc);    \
    conc_out[i] = spu_and(conc_out[i], acc)

    i = 0;
    x = n;
    while (x > 8) {
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        x -= 8;
    }
    while (x > 4) {
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        UNROLL_ELEMENT; ++i;
        x -= 4;
    }
    while (x > 0) {
        UNROLL_ELEMENT; ++i;
        --x;
    }
#undef UNROLL_ELEMENT
}
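Each element goes through the same scalar recipe in the final pass; a hypothetical scalar reference for what the UNROLL_ELEMENT macro computes per lane:

/* Scalar reference for UNROLL_ELEMENT: average the twice-advanced value
 * with the original concentration, then clamp negative results to zero
 * (the SIMD code performs the clamp with a compare mask and spu_and). */
static double discretize_final(double original, double advanced)
{
    double avg = 0.5 * (original + advanced);
    return avg > 0.0 ? avg : 0.0;
}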
vector double
__divv2df3 (vector double a_in, vector double b_in)
{
  /* Variables */
  vec_int4    exp, exp_bias;
  vec_uint4   no_underflow, overflow;
  vec_float4  mant_bf, inv_bf;
  vec_ullong2 exp_a, exp_b;
  vec_ullong2 a_nan, a_zero, a_inf, a_denorm, a_denorm0;
  vec_ullong2 b_nan, b_zero, b_inf, b_denorm, b_denorm0;
  vec_ullong2 nan;
  vec_uint4   a_exp, b_exp;
  vec_ullong2 a_mant_0, b_mant_0;
  vec_ullong2 a_exp_1s, b_exp_1s;
  vec_ullong2 sign_exp_mask;
  vec_double2 a, b;
  vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult;

  /* Constants */
  vec_uint4   exp_mask_u32 = spu_splats((unsigned int)0x7FF00000);
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3,  8, 9,10,11, 8,9,10,11 };
  vec_uchar16 swap_32  = (vec_uchar16) { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
  vec_ullong2 exp_mask  = spu_splats(0x7FF0000000000000ULL);
  vec_ullong2 sign_mask = spu_splats(0x8000000000000000ULL);
  vec_float4  onef = spu_splats(1.0f);
  vec_double2 one  = spu_splats(1.0);
  vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);

  sign_exp_mask = spu_or(sign_mask, exp_mask);

  /* Extract the floating point components from each of the operands including
   * exponent and mantissa.
   */
  a_exp = (vec_uint4)spu_and((vec_uint4)a_in, exp_mask_u32);
  a_exp = spu_shuffle(a_exp, a_exp, splat_hi);
  b_exp = (vec_uint4)spu_and((vec_uint4)b_in, exp_mask_u32);
  b_exp = spu_shuffle(b_exp, b_exp, splat_hi);

  a_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_mask), 0);
  a_mant_0 = spu_and(a_mant_0, spu_shuffle(a_mant_0, a_mant_0, swap_32));

  b_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_mask), 0);
  b_mant_0 = spu_and(b_mant_0, spu_shuffle(b_mant_0, b_mant_0, swap_32));

  a_exp_1s = (vec_ullong2)spu_cmpeq(a_exp, exp_mask_u32);
  b_exp_1s = (vec_ullong2)spu_cmpeq(b_exp, exp_mask_u32);

  /* Identify all possible special values that must be accommodated including:
   * +-denorm, +-0, +-infinity, and NaNs.
   */
  a_denorm0 = (vec_ullong2)spu_cmpeq(a_exp, 0);
  a_nan     = spu_andc(a_exp_1s, a_mant_0);
  a_zero    = spu_and (a_denorm0, a_mant_0);
  a_inf     = spu_and (a_exp_1s, a_mant_0);
  a_denorm  = spu_andc(a_denorm0, a_zero);

  b_denorm0 = (vec_ullong2)spu_cmpeq(b_exp, 0);
  b_nan     = spu_andc(b_exp_1s, b_mant_0);
  b_zero    = spu_and (b_denorm0, b_mant_0);
  b_inf     = spu_and (b_exp_1s, b_mant_0);
  b_denorm  = spu_andc(b_denorm0, b_zero);

  /* Scale denorm inputs into normalized numbers by conditionally scaling the
   * input parameters.
   */
  a = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_mask));
  a = spu_sel(a_in, a, a_denorm);

  b = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_mask));
  b = spu_sel(b_in, b, b_denorm);

  /* Extract the divisor and dividend exponent and force parameters into the signed
   * range [1.0,2.0) or [-1.0,2.0).
   */
  exp_a = spu_and((vec_ullong2)a, exp_mask);
  exp_b = spu_and((vec_ullong2)b, exp_mask);

  mant_a = spu_sel(a, one, (vec_ullong2)exp_mask);
  mant_b = spu_sel(b, one, (vec_ullong2)exp_mask);

  /* Approximate the single reciprocal of b by using
   * the single precision reciprocal estimate followed by one
   * single precision iteration of Newton-Raphson.
   */
  mant_bf = spu_roundtf(mant_b);
  inv_bf = spu_re(mant_bf);
  inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf);

  /* Perform 2 more Newton-Raphson iterations in double precision.  The
   * result (q1) is in the range (0.5, 2.0).
   */
  inv_b = spu_extend(inv_bf);
  inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b);
  q0 = spu_mul(mant_a, inv_b);
  q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0);

  /* Determine the exponent correction factor that must be applied
   * to q1 by taking into account the exponent of the normalized inputs
   * and the scale factors that were applied to normalize them.
   */
  exp = spu_rlmaska(spu_sub((vec_int4)exp_a, (vec_int4)exp_b), -20);
  exp = spu_add(exp, (vec_int4)spu_add(spu_and((vec_int4)a_denorm, -0x34),
                                       spu_and((vec_int4)b_denorm, 0x34)));

  /* Bias the quotient exponent depending on the sign of the exponent correction
   * factor so that a single multiplier will ensure the entire double precision
   * domain (including denorms) can be achieved.
   *
   *    exp       bias q1     adjust exp
   *   =====      ========    ==========
   *   positive   2^+65       -65
   *   negative   2^-64       +64
   */
  exp_bias = spu_xor(spu_rlmaska(exp, -31), 64);
  exp = spu_sub(exp, exp_bias);

  q1 = spu_sel(q1, (vec_double2)spu_add((vec_int4)q1, spu_sl(exp_bias, 20)), exp_mask);

  /* Compute a multiplier (mult) to be applied to the quotient (q1) to produce the
   * expected result.  On overflow, clamp the multiplier to the maximum non-infinite
   * number in case the rounding mode is not round-to-nearest.
   */
  exp = spu_add(exp, 0x3FF);
  no_underflow = spu_cmpgt(exp, 0);
  overflow = spu_cmpgt(exp, 0x7FE);
  exp = spu_and(spu_sl(exp, 20), (vec_int4)no_underflow);
  exp = spu_and(exp, (vec_int4)exp_mask);

  mult = spu_sel((vec_double2)exp,
                 (vec_double2)(spu_add((vec_uint4)exp_mask, -1)),
                 (vec_ullong2)overflow);

  /* Handle special value conditions.  These include:
   *
   * 1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN a NaN
   *    results.
   * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN an INFINITY results.
   * 3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0 results.
   */
  mult = spu_andc(mult, (vec_double2)spu_or(a_zero, b_inf));
  mult = spu_sel(mult, (vec_double2)exp_mask, spu_or(a_inf, b_zero));

  nan = spu_or(a_nan, b_nan);
  nan = spu_or(nan, spu_and(a_zero, b_zero));
  nan = spu_or(nan, spu_and(a_inf, b_inf));

  mult = spu_or(mult, (vec_double2)nan);

  /* Scale the final quotient */
  q2 = spu_mul(q1, mult);

  return (q2);
}
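The reciprocal refinement above is textbook Newton-Raphson: given an estimate r of 1/b, the update r' = r + r*(1 - b*r) roughly doubles the number of correct bits per step. A scalar illustration of the step that spu_madd(spu_nmsub(b, r, one), r, r) performs per lane (hypothetical, for clarity only):

#include <stdio.h>

/* One Newton-Raphson refinement of a reciprocal estimate:
 *   r' = (1 - b*r)*r + r */
static double refine_recip(double b, double r)
{
    return (1.0 - b * r) * r + r;
}

int main(void)
{
    double b = 3.0, r = 0.3;   /* crude initial estimate of 1/3 */
    r = refine_recip(b, r);    /* ~0.33     */
    r = refine_recip(b, r);    /* ~0.3333   */
    printf("%.12f\n", r);
    return 0;
}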
void process_buffer(int buffer, int cnt, vector float dt_v)
{
  int i;
  volatile vector float *p_inv_mass_v;
  vector float force_v, inv_mass_v;
  vector float pos0, pos1, pos2, pos3;
  vector float vel0, vel1, vel2, vel3;
  vector float dt_inv_mass_v, dt_inv_mass_v_0, dt_inv_mass_v_1,
               dt_inv_mass_v_2, dt_inv_mass_v_3;
  vector unsigned char splat_word_0 = (vector unsigned char){ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  vector unsigned char splat_word_1 = (vector unsigned char){ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7};
  vector unsigned char splat_word_2 = (vector unsigned char){ 8, 9,10,11, 8, 9,10,11, 8, 9,10,11, 8, 9,10,11};
  vector unsigned char splat_word_3 = (vector unsigned char){12,13,14,15,12,13,14,15,12,13,14,15,12,13,14,15};

  p_inv_mass_v = (volatile vector float *)&inv_mass[buffer][0];
  force_v = ctx.force_v;

  // Compute the step in time for the block of particles, four
  // particles at a time.
  for (i = 0; i < cnt; i += 4) {
    inv_mass_v = *p_inv_mass_v++;

    pos0 = pos[buffer][i+0];
    pos1 = pos[buffer][i+1];
    pos2 = pos[buffer][i+2];
    pos3 = pos[buffer][i+3];

    vel0 = vel[buffer][i+0];
    vel1 = vel[buffer][i+1];
    vel2 = vel[buffer][i+2];
    vel3 = vel[buffer][i+3];

    dt_inv_mass_v = spu_mul(dt_v, inv_mass_v);

    pos0 = spu_madd(vel0, dt_v, pos0);
    pos1 = spu_madd(vel1, dt_v, pos1);
    pos2 = spu_madd(vel2, dt_v, pos2);
    pos3 = spu_madd(vel3, dt_v, pos3);

    dt_inv_mass_v_0 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_0);
    dt_inv_mass_v_1 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_1);
    dt_inv_mass_v_2 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_2);
    dt_inv_mass_v_3 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_3);

    vel0 = spu_madd(dt_inv_mass_v_0, force_v, vel0);
    vel1 = spu_madd(dt_inv_mass_v_1, force_v, vel1);
    vel2 = spu_madd(dt_inv_mass_v_2, force_v, vel2);
    vel3 = spu_madd(dt_inv_mass_v_3, force_v, vel3);

    pos[buffer][i+0] = pos0;
    pos[buffer][i+1] = pos1;
    pos[buffer][i+2] = pos2;
    pos[buffer][i+3] = pos3;

    vel[buffer][i+0] = vel0;
    vel[buffer][i+1] = vel1;
    vel[buffer][i+2] = vel2;
    vel[buffer][i+3] = vel3;
  }
}

int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long argv)
{
  int buffer, next_buffer;
  int cnt, next_cnt, left;
  float time, dt;
  vector float dt_v;
  volatile vector float *ctx_pos_v, *ctx_vel_v;
  volatile vector float *next_ctx_pos_v, *next_ctx_vel_v;
  volatile float *ctx_inv_mass, *next_ctx_inv_mass;
  unsigned int tags[2];

  // Reserve a pair of DMA tag IDs
  tags[0] = mfc_tag_reserve();
  tags[1] = mfc_tag_reserve();

  // Input parameter argv is a pointer to the particle context.
  // Fetch the parameter context, waiting for it to complete.
  spu_writech(MFC_WrTagMask, 1 << tags[0]);
  spu_mfcdma32((void *)(&ctx), (unsigned int)argv, sizeof(parm_context),
               tags[0], MFC_GET_CMD);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  dt = ctx.dt;
  dt_v = spu_splats(dt);

  // For each step in time
  for (time = 0; time < END_OF_TIME; time += dt) {
    // For each double buffered block of particles
    left = ctx.particles;
    cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

    ctx_pos_v = ctx.pos_v;
    ctx_vel_v = ctx.vel_v;
    ctx_inv_mass = ctx.inv_mass;

    // Prefetch first buffer of input data.
    buffer = 0;
    spu_mfcdma32((void *)(pos), (unsigned int)(ctx_pos_v),
                 cnt * sizeof(vector float), tags[0], MFC_GETB_CMD);
    spu_mfcdma32((void *)(vel), (unsigned int)(ctx_vel_v),
                 cnt * sizeof(vector float), tags[0], MFC_GET_CMD);
    spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx_inv_mass),
                 cnt * sizeof(float), tags[0], MFC_GET_CMD);

    while (cnt < left) {
      left -= cnt;

      next_ctx_pos_v = ctx_pos_v + cnt;
      next_ctx_vel_v = ctx_vel_v + cnt;
      next_ctx_inv_mass = ctx_inv_mass + cnt;
      next_cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

      // Prefetch next buffer so the data is available for computation on the
      // next loop iteration. The first DMA is barriered so that we don't GET
      // data before the previous iteration's data is PUT.
      next_buffer = buffer ^ 1;

      spu_mfcdma32((void *)(&pos[next_buffer][0]), (unsigned int)(next_ctx_pos_v),
                   next_cnt * sizeof(vector float), tags[next_buffer], MFC_GETB_CMD);
      spu_mfcdma32((void *)(&vel[next_buffer][0]), (unsigned int)(next_ctx_vel_v),
                   next_cnt * sizeof(vector float), tags[next_buffer], MFC_GET_CMD);
      spu_mfcdma32((void *)(&inv_mass[next_buffer][0]), (unsigned int)(next_ctx_inv_mass),
                   next_cnt * sizeof(float), tags[next_buffer], MFC_GET_CMD);

      // Wait for previously prefetched data
      spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

      process_buffer(buffer, cnt, dt_v);

      // Put the buffer's position and velocity data back into system memory
      spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v),
                   cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
      spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v),
                   cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);

      ctx_pos_v = next_ctx_pos_v;
      ctx_vel_v = next_ctx_vel_v;
      ctx_inv_mass = next_ctx_inv_mass;

      buffer = next_buffer;
      cnt = next_cnt;
    }

    // Wait for previously prefetched data
    spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

    process_buffer(buffer, cnt, dt_v);

    // Put the buffer's position and velocity data back into system memory
    spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v),
                 cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
    spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v),
                 cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);

    // Wait for DMAs to complete before starting the next step in time.
    spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
  }

  return (0);
}
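The loop above realizes the classic double-buffering schedule: the GET for the next block is issued before waiting on the current one, so DMA overlaps computation. A stripped-down, self-contained sketch of that schedule, with hypothetical stub helpers standing in for the MFC calls:

/* Stubs standing in for the MFC operations (hypothetical, illustration only). */
static void issue_get(int buf, int blk) { (void)buf; (void)blk; }
static void issue_put(int buf, int blk) { (void)buf; (void)blk; }
static void wait_for(int buf)           { (void)buf; }
static void compute(int buf)            { (void)buf; }
static void wait_all(void)              { }

/* Outline of the double-buffering schedule used above: prefetch block blk+1
 * into buffer buf^1 before waiting on buffer buf, so one block's transfer
 * overlaps another block's computation. */
void double_buffer_outline(int nblocks)
{
    int buf = 0;
    issue_get(buf, 0);                   /* prefetch first block   */
    for (int blk = 0; blk < nblocks; ++blk) {
        if (blk + 1 < nblocks)
            issue_get(buf ^ 1, blk + 1); /* prefetch next block    */
        wait_for(buf);                   /* wait for current block */
        compute(buf);
        issue_put(buf, blk);             /* write results back     */
        buf ^= 1;
    }
    wait_all();
}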
inline void calc(ELEM_TYPE* A, ELEM_TYPE* B, ELEM_TYPE* C) {

  //register WRRecord* wrRecord = (WRRecord*)C;
  //register int startRow = wrRecord->startRow;
  //register int startCol = wrRecord->startCol;

  // DEBUG
  //printf("SPE_%d :: startRow = %d, startCol = %d...\n", (int)getSPEID(), startRow, startCol);
  // DEBUG
  //printf("SPE_%d :: A = %p, B = %p, C = %p...\n", (int)getSPEID(), A, B, C);

  register int r, c;

  // Fill in C
  for (r = 0; r < NUM_ROWS_PER_WR; r++) {
    for (c = 0; c < NUM_COLS_PER_WR; c++) {

      // Init the pointers
      register vector ELEM_TYPE* APtr = (vector ELEM_TYPE*)(A + (r * MATRIX_A_COLS));
      register vector ELEM_TYPE* BPtr = (vector ELEM_TYPE*)(B + (c * MATRIX_B_ROWS));

      #if USE_DOUBLE == 0
        register vector ELEM_TYPE sumV = { 0.0f, 0.0f, 0.0f, 0.0f };
      #else
        register vector ELEM_TYPE sumV = { 0.0, 0.0 };
      #endif

      //// DEBUG
      //printf("SPE_%d :: Start C value [%d x %d]... APtr = %p, BPtr = %p\n", (int)getSPEID(), r, c, APtr, BPtr);
      //{
      //  register int i;
      //  printf("SPE_%d :: A's Row = { ", (int)getSPEID());
      //  for (i = 0; i < MATRIX_A_COLS; i++) printf("%f ", (double)*(((float*)(APtr)) + i));
      //  printf("}...\n");
      //  printf("SPE_%d :: B's Column = { ", (int)getSPEID());
      //  for (i = 0; i < MATRIX_B_ROWS; i++) printf("%f ", (double)*(((float*)(BPtr)) + i));
      //  printf("}...\n");
      //}

      register int i;
      for (i = 0; i < MATRIX_A_COLS; i += (16 / sizeof(ELEM_TYPE))) {

        register vector ELEM_TYPE aV = *APtr;
        register vector ELEM_TYPE bV = *BPtr;

        // DEBUG
        //printf("SPE :: aV = { %f, %f, %f, %f }\n", spu_extract(aV, 0), spu_extract(aV, 1), spu_extract(aV, 2), spu_extract(aV, 3));
        //printf("SPE :: bV = { %f, %f, %f, %f }\n", spu_extract(bV, 0), spu_extract(bV, 1), spu_extract(bV, 2), spu_extract(bV, 3));

        APtr += 1;
        BPtr += 1;

        sumV = spu_madd(aV, bV, sumV);

        // DEBUG
        //printf("SPE :: sumV = { %f, %f, %f, %f }\n", spu_extract(sumV, 0), spu_extract(sumV, 1), spu_extract(sumV, 2), spu_extract(sumV, 3));
      }

      // Add the elements of the sumV vector together
      #if USE_DOUBLE == 0
        register ELEM_TYPE sum = 0.0f;
        sum += spu_extract(sumV, 0);
        sum += spu_extract(sumV, 1);
        sum += spu_extract(sumV, 2);
        sum += spu_extract(sumV, 3);
      #else
        register ELEM_TYPE sum = 0.0;
        sum += spu_extract(sumV, 0);
        sum += spu_extract(sumV, 1);
      #endif

      // Store in C
      C[c + (r * NUM_COLS_PER_WR)] = sum;

      // DEBUG
      //printf("SPE_%d :: C value [%d x %d] = %f\n", (int)getSPEID(), r, c, sum);
    }
  }
}
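Because BPtr walks B + c * MATRIX_B_ROWS contiguously, the code implicitly assumes B is stored column-major (each column laid out consecutively), so each spu_madd accumulates a slice of row r of A against a slice of column c of B. A hypothetical scalar reference of the same product under that layout:

/* Scalar reference for calc(): C[r][c] = dot(row r of A, column c of B),
 * with B stored column-major as the vector code assumes. */
void matmul_ref(const float *A, const float *B, float *C,
                int rows, int cols, int inner)
{
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c) {
            float sum = 0.0f;
            for (int i = 0; i < inner; ++i)
                sum += A[r * inner + i] * B[c * inner + i];
            C[r * cols + c] = sum;
        }
}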
int main( unsigned long long spe_id,
          unsigned long long ppu_vector_a,
          unsigned long long ppu_vector_b)
{
    int i, iter, buf_idx, vec_idx;
    unsigned long long ppu_vector_bases[2] _ALIG(128);
    vector float *pchunk_a, *pchunk_b;
    vector float g_vec = {0, 0, 0, 0};

    ppu_vector_bases[0] = ppu_vector_a;
    ppu_vector_bases[1] = ppu_vector_b;

    const unsigned int spu_num = spu_read_in_mbox();
    unsigned long long get_edge_bytes = spu_num * SUBVEC_SZ_BYTES;

    float buffers[NBUFFERS * BUF_SZ_FLOATS] _ALIG(128);
    int buffer_tags[NBUFFERS][2] _ALIG(128);
    //int buffer_tags[NBUFFERS];

    for (iter = 0; iter < NBUFFERS; ++iter) {
        buffer_tags[iter][0] = mfc_tag_reserve();
        buffer_tags[iter][1] = mfc_tag_reserve();
    }

    // first mfc_get for all
    for (buf_idx = 0; buf_idx < NBUFFERS; ++buf_idx) {
        for (vec_idx = 0; vec_idx < 2; ++vec_idx) {
            mfc_get(buf_ptr_float(buffers, buf_idx, vec_idx),
                    ppu_vector_bases[vec_idx] + get_edge_bytes,
                    CHUNK_SZ_BYTES, buffer_tags[buf_idx][vec_idx], 0, 0);
        }
    }
    get_edge_bytes += CHUNK_SZ_BYTES;

    //printf("subvec_sz-chunks: %d\n", SUBVEC_SZ_CHUNKS);
    //printf("%d==%d\n", MAXITER*NBUFFERS*CHUNK_SZ_FLOATS, SUBVEC_SZ_FLOATS);

    int chunksleft = SUBVEC_SZ_CHUNKS;
    while (chunksleft != 0) {
        for (buf_idx = 0; chunksleft != 0 && buf_idx < NBUFFERS; ++buf_idx) {
            const int tag_mask = (1 << buffer_tags[buf_idx][0]) |
                                 (1 << buffer_tags[buf_idx][1]);
            mfc_write_tag_mask(tag_mask);
            mfc_read_tag_status_all();

            pchunk_a = buf_ptr_vecfloat(buffers, buf_idx, 0);
            pchunk_b = buf_ptr_vecfloat(buffers, buf_idx, 1);
            for (i = 0; i < CHUNK_SZ_FLOATVECS; ++i) {
                g_vec = spu_madd(pchunk_a[i], pchunk_b[i], g_vec);
            }

            // move this mfc_get to end of loop, check get_edge_bytes variable dynamics
            if (likely(iter != MAXITER - 1)) {
                for (vec_idx = 0; vec_idx < 2; ++vec_idx) {
                    mfc_get(buf_ptr_float(buffers, buf_idx, vec_idx),
                            ppu_vector_bases[vec_idx] + get_edge_bytes,
                            CHUNK_SZ_BYTES, buffer_tags[buf_idx][vec_idx], 0, 0);
                }
            }
            get_edge_bytes += CHUNK_SZ_BYTES;
            --chunksleft;
        }
    }

    for (iter = 0; iter < NBUFFERS; ++iter) {
        mfc_tag_release(buffer_tags[iter][0]);
        mfc_tag_release(buffer_tags[iter][1]);
    }

    float_uint_t retval;
    retval.f = spu_extract(g_vec, 0) + spu_extract(g_vec, 1) +
               spu_extract(g_vec, 2) + spu_extract(g_vec, 3);
    //printf("retval: %f\n", retval.f);
    spu_write_out_mbox(retval.i);
    return 0;
}
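The partial dot product is reduced horizontally and pushed through the outbound mailbox as raw bits. The float_uint_t type is not shown in the snippet; a conventional definition, assumed here, is a simple type-punning union:

/* Assumed definition of float_uint_t: the SPU mailbox carries 32-bit words,
 * so the float sum is reinterpreted as an unsigned int via a union. */
typedef union {
    float f;
    unsigned int i;
} float_uint_t;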